2023-02-24

プロダクト開発演習
テーマ: ライブドアニュースコーパスの文章分類 (LSTMとCNNの比較)
行ったこと: ライブドアニュースコーパスの文章分類を2つのモデル(LSTM・CNN)を用いて精度比較し、高精度なモデルを選定した。
参考としたサイト:

LSTMについて

CNNについて

●LSTM用データセットの準備

In [1]:

import os
from glob import glob
import pandas as pd
import linecache

categories = [name for name in os.listdir('text') if os.path.isdir("text/" +name)]
print(categories)

datasets = pd.DataFrame(columns=["title", "category"])
for cat in categories:
    path = "text/" + cat + "/*.txt"
    files = glob(path)
    for text_name in files:
        title = linecache.getline(text_name, 3)
        s = pd.Series([title, cat], index=datasets.columns)
        datasets = datasets.append(s, ignore_index=True)

['movie-enter', 'it-life-hack', 'kaden-channel', 'topic-news', 'livedoor-homme', 'peachy', 'sports-watch', 'dokujo-tsushin', 'smax']

●LSTMを用いた文章分類モデルの作成

In [2]:

import MeCab
import time

In [3]:

import MeCab
import re

tagger = MeCab.Tagger("-Owakati")

def make_wakati(sentence):
    sentence = tagger.parse(sentence)
    sentence = re.sub(r'[0-9０-９a-zA-Zａ-ｚＡ-Ｚ]+', " ", sentence)
    sentence = re.sub(r'[\．_－―─！＠＃＄％＾＆\-‐|\\＊\“（）＿■×+α※÷⇒—●★☆〇◎◆▼◇△□(：〜～＋=)／*&^%$#@!~`){}［］…\[\]\"\'\”\’:;<>?＜＞〔〕〈〉？、。・,\./『』【】「」→←○《》≪≫\n\u3000]+', "", sentence)
    wakati = sentence.split(" ")
    wakati = list(filter(("").__ne__, wakati))
    return wakati

In [4]:

word2index = {}
# 系列を揃えるためのパディング文字列<pad>を追加
# パディング文字列のIDは0とする
word2index.update({"<pad>":0})

for title in datasets["title"]:
    wakati = make_wakati(title)
    for word in wakati:
        if word in word2index: continue
        word2index[word] = len(word2index)
print("vocab size : ", len(word2index))

vocab size :  12944

In [5]:

from sklearn.model_selection import train_test_split
import random
from sklearn.utils import shuffle

cat2index = {}
for cat in categories:
    if cat in cat2index: continue
    cat2index[cat] = len(cat2index)

def sentence2index(sentence):
    wakati = make_wakati(sentence)
    return [word2index[w] for w in wakati]

def category2index(cat):
    return [cat2index[cat]]

index_datasets_title_tmp = []
index_datasets_category = []

# 系列の長さの最大値を取得。この長さに他の系列の長さをあわせる
max_len = 0
for title, category in zip(datasets["title"], datasets["category"]):
  index_title = sentence2index(title)
  index_category = category2index(category)
  index_datasets_title_tmp.append(index_title)
  index_datasets_category.append(index_category)
  if max_len < len(index_title):
    max_len = len(index_title)

# 系列の長さを揃えるために短い系列にパディングを追加
# 後ろパディングだと正しく学習できなかったので、前パディング
index_datasets_title = []
for title in index_datasets_title_tmp:
  for i in range(max_len - len(title)):
    title.insert(0, 0) # 前パディング
#     title.append(0)　# 後ろパディング
  index_datasets_title.append(title)

train_x, test_x, train_y, test_y = train_test_split(index_datasets_title, index_datasets_category, train_size=0.7)

# データをバッチでまとめるための関数
def train2batch(title, category, batch_size=100):
  title_batch = []
  category_batch = []
  title_shuffle, category_shuffle = shuffle(title, category)
  for i in range(0, len(title), batch_size):
    title_batch.append(title_shuffle[i:i+batch_size])
    category_batch.append(category_shuffle[i:i+batch_size])
  return title_batch, category_batch

In [6]:

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class LSTMClassifier(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size, layer_number):
        super(LSTMClassifier, self).__init__()
        self.hidden_dim = hidden_dim
        # <pad>の単語IDが0なので、padding_idx=0としている
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        # batch_first=Trueが大事！
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, num_layers=layer_number, batch_first=True)
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        self.softmax = nn.LogSoftmax()

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        #embeds.size() = (batch_size × len(sentence) × embedding_dim)
        _, lstm_out = self.lstm(embeds)
        # lstm_out[0].size() = (1 × batch_size × hidden_dim)
        tag_space = self.hidden2tag(lstm_out[0][2])
        # tag_space.size() = (1 × batch_size × tagset_size)

        # (batch_size × tagset_size)にするためにsqueeze()する
        tag_scores = self.softmax(tag_space.squeeze())
        # tag_scores.size() = (batch_size × tagset_size)

        return tag_scores

EMBEDDING_DIM = 200
HIDDEN_DIM = 128
VOCAB_SIZE = len(word2index)
TAG_SIZE = len(categories)
LAYER_NUMBER = 3
model = LSTMClassifier(EMBEDDING_DIM, HIDDEN_DIM, VOCAB_SIZE, TAG_SIZE, LAYER_NUMBER)
loss_function = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [7]:

lstm_train_accuracy_list = []
lstm_test_accuracy_list = []

maxepoch = 20
for epoch in range(maxepoch):
    start_time = time.process_time()

    temp_train_acc = 0
    train_loss = 0
    train_acc = 0
    title_batch, category_batch = train2batch(train_x, train_y)
    for i in range(len(title_batch)):
        batch_loss = 0

        model.zero_grad()

        title_tensor = torch.tensor(title_batch[i])
        # category_tensor.size() = (batch_size × 1)なので、squeeze()
        category_tensor = torch.tensor(category_batch[i]).squeeze()

        out = model(title_tensor)

        batch_loss = loss_function(out, category_tensor)
        _, preds = torch.max(out, 1)
        batch_loss.backward()
        optimizer.step()

        train_loss += batch_loss.item()
        temp_train_acc += torch.sum(preds==category_tensor).item()
    train_acc = temp_train_acc / (len(title_batch)*100)

    temp_test_acc = 0
    test_loss = 0
    test_acc = 0
    test_num = len(test_x)
    a = 0
    with torch.no_grad():
        title_batch, category_batch = train2batch(test_x, test_y)

        for i in range(len(title_batch)):
            title_tensor = torch.tensor(title_batch[i])
            category_tensor = torch.tensor(category_batch[i]).squeeze()

            out = model(title_tensor)
            batch_loss=loss_function(out, category_tensor)
            _, preds = torch.max(out, 1)

            test_loss += batch_loss.item()
            temp_test_acc += torch.sum(preds==category_tensor).item()
    test_acc = temp_test_acc / (len(title_batch)*100)
    print("epoch", epoch, "\t" , "train loss: ", round(train_loss, 5), "\t" , "train acc: ", round(train_acc, 5), "\t" , "test loss:", round(test_loss, 5), "\t" , "test acc:", round(test_acc, 5))

    lstm_train_accuracy_list.append(train_acc)
    lstm_test_accuracy_list.append(test_acc)

    end_time = time.process_time()
    elapsed_time = end_time - start_time
    print("process time: ", round(elapsed_time, 0), "[s]")

print("done.")

<ipython-input-6-81593e8a8fc4>:26: UserWarning: Implicit dimension choice for log_softmax has been deprecated. Change the call to include dim=X as an argument.
  tag_scores = self.softmax(tag_space.squeeze())

epoch 0 	 train loss:  101.45796 	 train acc:  0.28654 	 test loss: 36.7009 	 test acc: 0.41739
process time:  19.0 [s]
epoch 1 	 train loss:  64.87202 	 train acc:  0.56288 	 test loss: 27.30444 	 test acc: 0.56826
process time:  18.0 [s]
epoch 2 	 train loss:  40.17011 	 train acc:  0.73115 	 test loss: 24.36324 	 test acc: 0.62435
process time:  19.0 [s]
epoch 3 	 train loss:  23.96207 	 train acc:  0.84827 	 test loss: 24.78822 	 test acc: 0.6413
process time:  18.0 [s]
epoch 4 	 train loss:  13.94799 	 train acc:  0.91269 	 test loss: 25.57045 	 test acc: 0.6513
process time:  18.0 [s]
epoch 5 	 train loss:  6.92676 	 train acc:  0.95962 	 test loss: 27.38291 	 test acc: 0.65957
process time:  18.0 [s]
epoch 6 	 train loss:  4.59388 	 train acc:  0.96962 	 test loss: 28.50567 	 test acc: 0.65696
process time:  18.0 [s]
epoch 7 	 train loss:  2.26921 	 train acc:  0.98327 	 test loss: 30.072 	 test acc: 0.66
process time:  18.0 [s]
epoch 8 	 train loss:  1.30224 	 train acc:  0.98788 	 test loss: 31.47531 	 test acc: 0.66174
process time:  18.0 [s]
epoch 9 	 train loss:  0.84991 	 train acc:  0.98962 	 test loss: 31.78433 	 test acc: 0.6613
process time:  18.0 [s]
epoch 10 	 train loss:  0.70333 	 train acc:  0.99 	 test loss: 32.47073 	 test acc: 0.65957
process time:  18.0 [s]
epoch 11 	 train loss:  0.54112 	 train acc:  0.99058 	 test loss: 33.76472 	 test acc: 0.66478
process time:  18.0 [s]
epoch 12 	 train loss:  0.39043 	 train acc:  0.99096 	 test loss: 34.48341 	 test acc: 0.66696
process time:  18.0 [s]
epoch 13 	 train loss:  0.3547 	 train acc:  0.99115 	 test loss: 35.85314 	 test acc: 0.66565
process time:  18.0 [s]
epoch 14 	 train loss:  0.33564 	 train acc:  0.99096 	 test loss: 34.98831 	 test acc: 0.66435
process time:  18.0 [s]
epoch 15 	 train loss:  0.31613 	 train acc:  0.99115 	 test loss: 35.73679 	 test acc: 0.66043
process time:  18.0 [s]
epoch 16 	 train loss:  0.27396 	 train acc:  0.99115 	 test loss: 35.56435 	 test acc: 0.66565
process time:  18.0 [s]
epoch 17 	 train loss:  0.25152 	 train acc:  0.99115 	 test loss: 36.12127 	 test acc: 0.6613
process time:  18.0 [s]
epoch 18 	 train loss:  0.24455 	 train acc:  0.99115 	 test loss: 36.90332 	 test acc: 0.66217
process time:  18.0 [s]
epoch 19 	 train loss:  0.23821 	 train acc:  0.99115 	 test loss: 38.2615 	 test acc: 0.66174
process time:  18.0 [s]
done.

In [8]:

import matplotlib.pyplot as plt
plt.plot(range(maxepoch), lstm_train_accuracy_list)
plt.plot(range(maxepoch), lstm_test_accuracy_list)
plt.legend(["Train_accuracy", "Test_accuracy"])
plt.xlabel("#epoch")
plt.ylabel("Accuracy")
plt.show()

●CNN用データデータセットの準備

In [9]:

from pathlib import Path
import pandas as pd

In [10]:

paths = list(Path('text').iterdir())
labels = []
texts = []

for path in paths:
    for filepath in path.glob('*.txt'):
        if not filepath.name == 'LICENSE.txt':
            with open(filepath) as f:
                next(f)
                next(f)
                text = f.read().replace('\u3000','').replace('\n','')

            texts.append(text)
            labels.append(path.name)

In [11]:

news_df = pd.DataFrame({
    'body': texts,
    'category': labels
})

●CNNを用いた文章分類モデルの作成

In [12]:

import fugashi
from torchtext.legacy import data
import torch
import torch.nn as nn
import torch.nn.functional as F

In [13]:

tagger = fugashi.Tagger("-Owakati")

In [14]:

def make_wakati(text):
    text = tagger.parse(text)
    wakati = text.split(" ")
    wakati = list(filter(("").__ne__, wakati))
    return wakati

In [15]:

# カテゴリーをidに変換
categories = news_df["category"].unique().tolist()
news_df["category_id"] = news_df["category"].map(lambda x: categories.index(x))

# 元データを学習、検証、テストの3つに分割
train_val_df, test_df = train_test_split(news_df[["body", "category_id"]], train_size=0.8)
train_df, val_df = train_test_split(train_val_df, train_size=0.75)

print("train size", train_df.shape)
print("validation size", val_df.shape)
print("test size", test_df.shape)

# torchtext用にtsvファイルで保存
train_df.to_csv("train.tsv", sep="\t", index=False, header=None)
val_df.to_csv("val.tsv", sep="\t", index=False, header=None)
test_df.to_csv("test.tsv", sep="\t", index=False, header=None)

TEXT = data.Field(sequential=True, tokenize=make_wakati, lower=False, batch_first=True, pad_token="<pad>")
LABEL = data.Field(sequential=False, use_vocab=False)

train_data, val_data, test_data = data.TabularDataset.splits(path="/Users/xxxx/Desktop/", train="train.tsv", validation="val.tsv", test="test.tsv", format="tsv", fields=[("Text", TEXT), ("Label", LABEL)])

# vocabulary生成
# 学習データだけでvocabを作成します。
TEXT.build_vocab(train_data, min_freq=1)

BATCH_SIZE = 64
train_loader = data.Iterator(train_data, batch_size=BATCH_SIZE, train=True)
val_loader = data.Iterator(val_data, batch_size=BATCH_SIZE, train=False, sort=False)
test_loader = data.Iterator(test_data, batch_size=BATCH_SIZE, train=False, sort=False)

train size (4419, 2)
validation size (1474, 2)
test size (1474, 2)

In [16]:

class Net(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(Net, self).__init__()
        #単語分散表現はランダムベクトルを使う
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=TEXT.vocab.stoi["<pad>"])
        self.conv1 = nn.Conv2d(1, 2, kernel_size=(2, embedding_dim))
        self.conv2 = nn.Conv2d(1, 2, kernel_size=(3, embedding_dim))
        self.conv3 = nn.Conv2d(1, 2, kernel_size=(4, embedding_dim))

        #3つ畳み込み処理でそれぞれ2次元のベクトルが生成されるので、それらを全て結合して6次元のベクトルとなる
        #liveddorのカテゴリは9つなので、アウトプットサイズは9を指定
        self.linear = nn.Linear(6, 9)

    def forward(self, input_ids):
        #(i) 文章の行列を取得
        out = self.embeddings(input_ids)
        # チャネル数1を挿入
        out = out.unsqueeze(1)

        #(ii) 畳み込んでreluに通す
        out1 = F.relu(self.conv1(out))
        out2 = F.relu(self.conv2(out))
        out3 = F.relu(self.conv3(out))

        #(iii) poolingして、各特徴マップの最大要素を取得
        out1 = F.max_pool2d(out1, kernel_size=(out1.size()[2], 1))
        out2 = F.max_pool2d(out2, kernel_size=(out2.size()[2], 1))
        out3 = F.max_pool2d(out3, kernel_size=(out3.size()[2], 1))

        #(iv)viewして次元を整える
        out1 = out1.view(-1, 2)
        out2 = out2.view(-1, 2)
        out3 = out3.view(-1, 2)

        #(v)全部結合して1本のベクトルにする
        out = torch.cat([out1, out2, out3], dim=1)

        #(vi)全結合層で9つのカテゴリー分類できるように変換
        out = self.linear(out)

        return out

In [17]:

import torch.optim as optim

VOCAB_SIZE = len(TEXT.vocab.stoi)
EMBEDDING_DIM = 200

net = Net(VOCAB_SIZE, EMBEDDING_DIM)

loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=0.001)

# 損失、精度を格納する配列を準備
train_loss = []
val_loss = []
train_accuracy = []
val_accuracy = []

# グラフ反映用のリストを準備
cnn_train_accuracy_list = []
cnn_test_accuracy_list = []

maxepoch = 20
for epoch in range(maxepoch):
    start_time = time.process_time()

    #学習
    _train_loss = 0.0
    _train_acc = 0.0
    net.train()
    for batch in train_loader:
        inputs = batch.Text
        y = batch.Label
        optimizer.zero_grad()
        out = net(inputs)
        loss = loss_function(out, y)
        _, preds = torch.max(out, 1)
        loss.backward()
        optimizer.step()
        _train_loss += loss.item()
        _train_acc += torch.sum(preds == y).item()
    train_loss.append(_train_loss)
    train_epoch_acc = _train_acc / len(train_loader.dataset)
    train_accuracy.append(train_epoch_acc)

    #検証
    _val_loss = 0.0
    _val_acc = 0.0
    net.eval()
    with torch.no_grad():
        for batch in val_loader:
            inputs = batch.Text
            y = batch.Label
            out = net(inputs)
            loss = loss_function(out, y)
            _, preds = torch.max(out, 1)
            _val_loss += loss.item()
            _val_acc += torch.sum(preds == y).item()
    val_loss.append(_val_loss)
    val_epoch_acc = _val_acc / len(val_loader.dataset)
    val_accuracy.append(val_epoch_acc)

    print("epoch", epoch,
         "\ttrain loss", round(_train_loss, 4), "\ttrain accuracy", round(train_epoch_acc, 4),
         "\tval loss", round(_val_loss, 4), "\tval accuracy", round(val_epoch_acc, 4))
    cnn_train_accuracy_list.append(round(train_epoch_acc, 4))
    cnn_test_accuracy_list.append(round(val_epoch_acc, 4))

    end_time = time.process_time()
    elapsed_time = end_time - start_time
    print("time:", elapsed_time)

epoch 0 	train loss 148.5192 	train accuracy 0.2347 	val loss 43.8903 	val accuracy 0.4043
time: 362.094066
epoch 1 	train loss 107.3851 	train accuracy 0.535 	val loss 31.9789 	val accuracy 0.5936
time: 358.88564099999985
epoch 2 	train loss 79.805 	train accuracy 0.6766 	val loss 24.9609 	val accuracy 0.6913
time: 352.265895
epoch 3 	train loss 64.3195 	train accuracy 0.7461 	val loss 21.1068 	val accuracy 0.7252
time: 366.32607200000007
epoch 4 	train loss 53.5912 	train accuracy 0.7905 	val loss 18.155 	val accuracy 0.7612
time: 362.6128030000002
epoch 5 	train loss 43.6401 	train accuracy 0.8228 	val loss 16.4422 	val accuracy 0.7802
time: 366.38400500000034
epoch 6 	train loss 39.1445 	train accuracy 0.8592 	val loss 14.9904 	val accuracy 0.8005
time: 356.82478200000014
epoch 7 	train loss 32.4194 	train accuracy 0.8823 	val loss 14.2734 	val accuracy 0.8094
time: 363.46790199999987
epoch 8 	train loss 28.7035 	train accuracy 0.9016 	val loss 13.5009 	val accuracy 0.8202
time: 358.15388299999995
epoch 9 	train loss 24.9321 	train accuracy 0.9145 	val loss 12.9508 	val accuracy 0.8209
time: 362.11623599999984
epoch 10 	train loss 20.6999 	train accuracy 0.9335 	val loss 12.581 	val accuracy 0.8263
time: 363.47333400000025
epoch 11 	train loss 18.2341 	train accuracy 0.9443 	val loss 12.122 	val accuracy 0.8236
time: 368.4358520000005
epoch 12 	train loss 16.1649 	train accuracy 0.9525 	val loss 11.8146 	val accuracy 0.8412
time: 360.1432559999994
epoch 13 	train loss 14.2326 	train accuracy 0.9613 	val loss 11.598 	val accuracy 0.8412
time: 374.934569
epoch 14 	train loss 12.0825 	train accuracy 0.969 	val loss 11.3908 	val accuracy 0.8474
time: 360.7733240000007
epoch 15 	train loss 10.4957 	train accuracy 0.9737 	val loss 11.2721 	val accuracy 0.8487
time: 366.32139500000085
epoch 16 	train loss 9.0818 	train accuracy 0.981 	val loss 11.2058 	val accuracy 0.8548
time: 362.1448909999999
epoch 17 	train loss 8.1702 	train accuracy 0.981 	val loss 11.1899 	val accuracy 0.8535
time: 361.6823430000004
epoch 18 	train loss 7.3026 	train accuracy 0.9844 	val loss 11.198 	val accuracy 0.8507
time: 359.6158379999997
epoch 19 	train loss 6.7623 	train accuracy 0.9869 	val loss 11.2555 	val accuracy 0.8507
time: 368.5266160000001

In [18]:

import matplotlib.pyplot as plt
plt.plot(range(maxepoch), cnn_train_accuracy_list)
plt.plot(range(maxepoch), cnn_test_accuracy_list)
plt.legend(["Train_accuracy", "Test_accuracy"])
plt.xlabel("#epoch")
plt.ylabel("Accuracy")
plt.show()

●2つのモデルの精度比較

In [19]:

import matplotlib.pyplot as plt
plt.plot(range(maxepoch), lstm_test_accuracy_list)
plt.plot(range(maxepoch), cnn_test_accuracy_list)
plt.legend(["LSTM", "CNN"])
plt.xlabel("#epoch")
plt.ylabel("Accuracy")
plt.show()

結果: 20epochが完了し、 LSTM: 0.66, CNN: 　0.85の精度となった。そのため、今回作成したモデルではCNNの方が良い精度となった。
考察:

両モデルの学習データで、LSTM用のdatasets["title"]とCNN用のnews_df["body"]を比較すると、CNNの方がデータが大きい。その分学習時間を要するが、より深い学習ができたのではないかと考える

両モデルについて、サイトを参考に作成し、ハイパーパラメータのチューニングは特に行っていないため、チューニングすることで精度の向上を図ることができると考える。 LSTMの方が、調整できるパラメータが多いので、今回の精度0.66から伸び代は大きくあると考えており、特に単語の埋め込み次元をを増やすことでベクトルでの表現力が向上し精度もアップすると考える。

作成したモデルは、dropout等の正則化は利用していない。そのため、正則化することで精度の向上を期待できる。

In [20]:

datasets.head() # 参考 datasetsの中身

Out[20]:

	title	category
0	【DVDエンター！】誘拐犯に育てられた女が目にした真実は、孤独か幸福か\n	movie-enter
1	藤原竜也、中学生とともにロケット打ち上げに成功\n	movie-enter
2	『戦火の馬』ロイヤル・プレミアにウィリアム王子＆キャサリン妃が出席\n	movie-enter
3	香里奈、女子高生100人のガチンコ質問に回答「ラーメンも食べる」\n	movie-enter
4	ユージの前に立ちはだかったJOY「僕はAKBの高橋みなみを守る」\n	movie-enter

In [21]:

news_df.head() #参考 news_dfの中身

Out[21]:

	body	category
0	【DVDエンター！】誘拐犯に育てられた女が目にした真実は、孤独か幸福か2005年11月から翌...	movie-enter
1	藤原竜也、中学生とともにロケット打ち上げに成功「アンテナを張りながら生活をしていけばいい」2...	movie-enter
2	『戦火の馬』ロイヤル・プレミアにウィリアム王子＆キャサリン妃が出席3月2日より全国ロードショ...	movie-enter
3	香里奈、女子高生100人のガチンコ質問に回答「ラーメンも食べる」女優の香里奈が18日、都内で...	movie-enter
4	ユージの前に立ちはだかったJOY「僕はAKBの高橋みなみを守る」5日、東京・千代田区の内幸町...	movie-enter

In [ ]:

コメント・お問合せ