
プロダクト開発演習
テーマ: ライブドアニュースコーパスの文章分類 (LSTMとCNNの比較)
行ったこと: ライブドアニュースコーパスの文章分類を2つのモデル(LSTM・CNN)を用いて精度比較し、高精度なモデルを選定した。
参考としたサイト:
- LSTMについて https://qiita.com/m__k/items/841950a57a0d7ff05506
- CNNについて https://qiita.com/m__k/items/6c39cfe7dfa99102fa8e
https://qiita.com/m__k/items/db1a81bb06607d5b0ec5
https://arxiv.org/pdf/1510.03820.pdf
●LSTM用データセットの準備
In [1]:
import os
from glob import glob
import pandas as pd
import linecache
categories = [name for name in os.listdir('text') if os.path.isdir("text/" +name)]
print(categories)
datasets = pd.DataFrame(columns=["title", "category"])
for cat in categories:
path = "text/" + cat + "/*.txt"
files = glob(path)
for text_name in files:
title = linecache.getline(text_name, 3)
s = pd.Series([title, cat], index=datasets.columns)
datasets = datasets.append(s, ignore_index=True)
['movie-enter', 'it-life-hack', 'kaden-channel', 'topic-news', 'livedoor-homme', 'peachy', 'sports-watch', 'dokujo-tsushin', 'smax']
●LSTMを用いた文章分類モデルの作成
In [2]:
import MeCab
import time
In [3]:
import MeCab
import re
tagger = MeCab.Tagger("-Owakati")
def make_wakati(sentence):
sentence = tagger.parse(sentence)
sentence = re.sub(r'[0-90-9a-zA-Za-zA-Z]+', " ", sentence)
sentence = re.sub(r'[\._-―─!@#$%^&\-‐|\\*\“()_■×+α※÷⇒—●★☆〇◎◆▼◇△□(:〜~+=)/*&^%$#@!~`){}[]…\[\]\"\'\”\’:;<>?<>〔〕〈〉?、。・,\./『』【】「」→←○《》≪≫\n\u3000]+', "", sentence)
wakati = sentence.split(" ")
wakati = list(filter(("").__ne__, wakati))
return wakati
In [4]:
word2index = {}
# 系列を揃えるためのパディング文字列<pad>を追加
# パディング文字列のIDは0とする
word2index.update({"<pad>":0})
for title in datasets["title"]:
wakati = make_wakati(title)
for word in wakati:
if word in word2index: continue
word2index[word] = len(word2index)
print("vocab size : ", len(word2index))
vocab size : 12944
In [5]:
from sklearn.model_selection import train_test_split
import random
from sklearn.utils import shuffle
cat2index = {}
for cat in categories:
if cat in cat2index: continue
cat2index[cat] = len(cat2index)
def sentence2index(sentence):
wakati = make_wakati(sentence)
return [word2index[w] for w in wakati]
def category2index(cat):
return [cat2index[cat]]
index_datasets_title_tmp = []
index_datasets_category = []
# 系列の長さの最大値を取得。この長さに他の系列の長さをあわせる
max_len = 0
for title, category in zip(datasets["title"], datasets["category"]):
index_title = sentence2index(title)
index_category = category2index(category)
index_datasets_title_tmp.append(index_title)
index_datasets_category.append(index_category)
if max_len < len(index_title):
max_len = len(index_title)
# 系列の長さを揃えるために短い系列にパディングを追加
# 後ろパディングだと正しく学習できなかったので、前パディング
index_datasets_title = []
for title in index_datasets_title_tmp:
for i in range(max_len - len(title)):
title.insert(0, 0) # 前パディング
# title.append(0) # 後ろパディング
index_datasets_title.append(title)
train_x, test_x, train_y, test_y = train_test_split(index_datasets_title, index_datasets_category, train_size=0.7)
# データをバッチでまとめるための関数
def train2batch(title, category, batch_size=100):
title_batch = []
category_batch = []
title_shuffle, category_shuffle = shuffle(title, category)
for i in range(0, len(title), batch_size):
title_batch.append(title_shuffle[i:i+batch_size])
category_batch.append(category_shuffle[i:i+batch_size])
return title_batch, category_batch
In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
class LSTMClassifier(nn.Module):
def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size, layer_number):
super(LSTMClassifier, self).__init__()
self.hidden_dim = hidden_dim
# <pad>の単語IDが0なので、padding_idx=0としている
self.word_embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
# batch_first=Trueが大事!
self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, num_layers=layer_number, batch_first=True)
self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
self.softmax = nn.LogSoftmax()
def forward(self, sentence):
embeds = self.word_embeddings(sentence)
#embeds.size() = (batch_size × len(sentence) × embedding_dim)
_, lstm_out = self.lstm(embeds)
# lstm_out[0].size() = (1 × batch_size × hidden_dim)
tag_space = self.hidden2tag(lstm_out[0][2])
# tag_space.size() = (1 × batch_size × tagset_size)
# (batch_size × tagset_size)にするためにsqueeze()する
tag_scores = self.softmax(tag_space.squeeze())
# tag_scores.size() = (batch_size × tagset_size)
return tag_scores
EMBEDDING_DIM = 200
HIDDEN_DIM = 128
VOCAB_SIZE = len(word2index)
TAG_SIZE = len(categories)
LAYER_NUMBER = 3
model = LSTMClassifier(EMBEDDING_DIM, HIDDEN_DIM, VOCAB_SIZE, TAG_SIZE, LAYER_NUMBER)
loss_function = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
In [7]:
lstm_train_accuracy_list = []
lstm_test_accuracy_list = []
maxepoch = 20
for epoch in range(maxepoch):
start_time = time.process_time()
temp_train_acc = 0
train_loss = 0
train_acc = 0
title_batch, category_batch = train2batch(train_x, train_y)
for i in range(len(title_batch)):
batch_loss = 0
model.zero_grad()
title_tensor = torch.tensor(title_batch[i])
# category_tensor.size() = (batch_size × 1)なので、squeeze()
category_tensor = torch.tensor(category_batch[i]).squeeze()
out = model(title_tensor)
batch_loss = loss_function(out, category_tensor)
_, preds = torch.max(out, 1)
batch_loss.backward()
optimizer.step()
train_loss += batch_loss.item()
temp_train_acc += torch.sum(preds==category_tensor).item()
train_acc = temp_train_acc / (len(title_batch)*100)
temp_test_acc = 0
test_loss = 0
test_acc = 0
test_num = len(test_x)
a = 0
with torch.no_grad():
title_batch, category_batch = train2batch(test_x, test_y)
for i in range(len(title_batch)):
title_tensor = torch.tensor(title_batch[i])
category_tensor = torch.tensor(category_batch[i]).squeeze()
out = model(title_tensor)
batch_loss=loss_function(out, category_tensor)
_, preds = torch.max(out, 1)
test_loss += batch_loss.item()
temp_test_acc += torch.sum(preds==category_tensor).item()
test_acc = temp_test_acc / (len(title_batch)*100)
print("epoch", epoch, "\t" , "train loss: ", round(train_loss, 5), "\t" , "train acc: ", round(train_acc, 5), "\t" , "test loss:", round(test_loss, 5), "\t" , "test acc:", round(test_acc, 5))
lstm_train_accuracy_list.append(train_acc)
lstm_test_accuracy_list.append(test_acc)
end_time = time.process_time()
elapsed_time = end_time - start_time
print("process time: ", round(elapsed_time, 0), "[s]")
print("done.")
<ipython-input-6-81593e8a8fc4>:26: UserWarning: Implicit dimension choice for log_softmax has been deprecated. Change the call to include dim=X as an argument. tag_scores = self.softmax(tag_space.squeeze())
epoch 0 train loss: 101.45796 train acc: 0.28654 test loss: 36.7009 test acc: 0.41739 process time: 19.0 [s] epoch 1 train loss: 64.87202 train acc: 0.56288 test loss: 27.30444 test acc: 0.56826 process time: 18.0 [s] epoch 2 train loss: 40.17011 train acc: 0.73115 test loss: 24.36324 test acc: 0.62435 process time: 19.0 [s] epoch 3 train loss: 23.96207 train acc: 0.84827 test loss: 24.78822 test acc: 0.6413 process time: 18.0 [s] epoch 4 train loss: 13.94799 train acc: 0.91269 test loss: 25.57045 test acc: 0.6513 process time: 18.0 [s] epoch 5 train loss: 6.92676 train acc: 0.95962 test loss: 27.38291 test acc: 0.65957 process time: 18.0 [s] epoch 6 train loss: 4.59388 train acc: 0.96962 test loss: 28.50567 test acc: 0.65696 process time: 18.0 [s] epoch 7 train loss: 2.26921 train acc: 0.98327 test loss: 30.072 test acc: 0.66 process time: 18.0 [s] epoch 8 train loss: 1.30224 train acc: 0.98788 test loss: 31.47531 test acc: 0.66174 process time: 18.0 [s] epoch 9 train loss: 0.84991 train acc: 0.98962 test loss: 31.78433 test acc: 0.6613 process time: 18.0 [s] epoch 10 train loss: 0.70333 train acc: 0.99 test loss: 32.47073 test acc: 0.65957 process time: 18.0 [s] epoch 11 train loss: 0.54112 train acc: 0.99058 test loss: 33.76472 test acc: 0.66478 process time: 18.0 [s] epoch 12 train loss: 0.39043 train acc: 0.99096 test loss: 34.48341 test acc: 0.66696 process time: 18.0 [s] epoch 13 train loss: 0.3547 train acc: 0.99115 test loss: 35.85314 test acc: 0.66565 process time: 18.0 [s] epoch 14 train loss: 0.33564 train acc: 0.99096 test loss: 34.98831 test acc: 0.66435 process time: 18.0 [s] epoch 15 train loss: 0.31613 train acc: 0.99115 test loss: 35.73679 test acc: 0.66043 process time: 18.0 [s] epoch 16 train loss: 0.27396 train acc: 0.99115 test loss: 35.56435 test acc: 0.66565 process time: 18.0 [s] epoch 17 train loss: 0.25152 train acc: 0.99115 test loss: 36.12127 test acc: 0.6613 process time: 18.0 [s] epoch 18 train loss: 0.24455 train acc: 0.99115 test loss: 36.90332 test acc: 0.66217 process time: 18.0 [s] epoch 19 train loss: 0.23821 train acc: 0.99115 test loss: 38.2615 test acc: 0.66174 process time: 18.0 [s] done.
In [8]:
import matplotlib.pyplot as plt
plt.plot(range(maxepoch), lstm_train_accuracy_list)
plt.plot(range(maxepoch), lstm_test_accuracy_list)
plt.legend(["Train_accuracy", "Test_accuracy"])
plt.xlabel("#epoch")
plt.ylabel("Accuracy")
plt.show()
●CNN用データデータセットの準備
In [9]:
from pathlib import Path
import pandas as pd
In [10]:
paths = list(Path('text').iterdir())
labels = []
texts = []
for path in paths:
for filepath in path.glob('*.txt'):
if not filepath.name == 'LICENSE.txt':
with open(filepath) as f:
next(f)
next(f)
text = f.read().replace('\u3000','').replace('\n','')
texts.append(text)
labels.append(path.name)
In [11]:
news_df = pd.DataFrame({
'body': texts,
'category': labels
})
●CNNを用いた文章分類モデルの作成
In [12]:
import fugashi
from torchtext.legacy import data
import torch
import torch.nn as nn
import torch.nn.functional as F
In [13]:
tagger = fugashi.Tagger("-Owakati")
In [14]:
def make_wakati(text):
text = tagger.parse(text)
wakati = text.split(" ")
wakati = list(filter(("").__ne__, wakati))
return wakati
In [15]:
# カテゴリーをidに変換
categories = news_df["category"].unique().tolist()
news_df["category_id"] = news_df["category"].map(lambda x: categories.index(x))
# 元データを学習、検証、テストの3つに分割
train_val_df, test_df = train_test_split(news_df[["body", "category_id"]], train_size=0.8)
train_df, val_df = train_test_split(train_val_df, train_size=0.75)
print("train size", train_df.shape)
print("validation size", val_df.shape)
print("test size", test_df.shape)
# torchtext用にtsvファイルで保存
train_df.to_csv("train.tsv", sep="\t", index=False, header=None)
val_df.to_csv("val.tsv", sep="\t", index=False, header=None)
test_df.to_csv("test.tsv", sep="\t", index=False, header=None)
TEXT = data.Field(sequential=True, tokenize=make_wakati, lower=False, batch_first=True, pad_token="<pad>")
LABEL = data.Field(sequential=False, use_vocab=False)
train_data, val_data, test_data = data.TabularDataset.splits(path="/Users/xxxx/Desktop/", train="train.tsv", validation="val.tsv", test="test.tsv", format="tsv", fields=[("Text", TEXT), ("Label", LABEL)])
# vocabulary生成
# 学習データだけでvocabを作成します。
TEXT.build_vocab(train_data, min_freq=1)
BATCH_SIZE = 64
train_loader = data.Iterator(train_data, batch_size=BATCH_SIZE, train=True)
val_loader = data.Iterator(val_data, batch_size=BATCH_SIZE, train=False, sort=False)
test_loader = data.Iterator(test_data, batch_size=BATCH_SIZE, train=False, sort=False)
train size (4419, 2) validation size (1474, 2) test size (1474, 2)
In [16]:
class Net(nn.Module):
def __init__(self, vocab_size, embedding_dim):
super(Net, self).__init__()
#単語分散表現はランダムベクトルを使う
self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=TEXT.vocab.stoi["<pad>"])
self.conv1 = nn.Conv2d(1, 2, kernel_size=(2, embedding_dim))
self.conv2 = nn.Conv2d(1, 2, kernel_size=(3, embedding_dim))
self.conv3 = nn.Conv2d(1, 2, kernel_size=(4, embedding_dim))
#3つ畳み込み処理でそれぞれ2次元のベクトルが生成されるので、それらを全て結合して6次元のベクトルとなる
#liveddorのカテゴリは9つなので、アウトプットサイズは9を指定
self.linear = nn.Linear(6, 9)
def forward(self, input_ids):
#(i) 文章の行列を取得
out = self.embeddings(input_ids)
# チャネル数1を挿入
out = out.unsqueeze(1)
#(ii) 畳み込んでreluに通す
out1 = F.relu(self.conv1(out))
out2 = F.relu(self.conv2(out))
out3 = F.relu(self.conv3(out))
#(iii) poolingして、各特徴マップの最大要素を取得
out1 = F.max_pool2d(out1, kernel_size=(out1.size()[2], 1))
out2 = F.max_pool2d(out2, kernel_size=(out2.size()[2], 1))
out3 = F.max_pool2d(out3, kernel_size=(out3.size()[2], 1))
#(iv)viewして次元を整える
out1 = out1.view(-1, 2)
out2 = out2.view(-1, 2)
out3 = out3.view(-1, 2)
#(v)全部結合して1本のベクトルにする
out = torch.cat([out1, out2, out3], dim=1)
#(vi)全結合層で9つのカテゴリー分類できるように変換
out = self.linear(out)
return out
In [17]:
import torch.optim as optim
VOCAB_SIZE = len(TEXT.vocab.stoi)
EMBEDDING_DIM = 200
net = Net(VOCAB_SIZE, EMBEDDING_DIM)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=0.001)
# 損失、精度を格納する配列を準備
train_loss = []
val_loss = []
train_accuracy = []
val_accuracy = []
# グラフ反映用のリストを準備
cnn_train_accuracy_list = []
cnn_test_accuracy_list = []
maxepoch = 20
for epoch in range(maxepoch):
start_time = time.process_time()
#学習
_train_loss = 0.0
_train_acc = 0.0
net.train()
for batch in train_loader:
inputs = batch.Text
y = batch.Label
optimizer.zero_grad()
out = net(inputs)
loss = loss_function(out, y)
_, preds = torch.max(out, 1)
loss.backward()
optimizer.step()
_train_loss += loss.item()
_train_acc += torch.sum(preds == y).item()
train_loss.append(_train_loss)
train_epoch_acc = _train_acc / len(train_loader.dataset)
train_accuracy.append(train_epoch_acc)
#検証
_val_loss = 0.0
_val_acc = 0.0
net.eval()
with torch.no_grad():
for batch in val_loader:
inputs = batch.Text
y = batch.Label
out = net(inputs)
loss = loss_function(out, y)
_, preds = torch.max(out, 1)
_val_loss += loss.item()
_val_acc += torch.sum(preds == y).item()
val_loss.append(_val_loss)
val_epoch_acc = _val_acc / len(val_loader.dataset)
val_accuracy.append(val_epoch_acc)
print("epoch", epoch,
"\ttrain loss", round(_train_loss, 4), "\ttrain accuracy", round(train_epoch_acc, 4),
"\tval loss", round(_val_loss, 4), "\tval accuracy", round(val_epoch_acc, 4))
cnn_train_accuracy_list.append(round(train_epoch_acc, 4))
cnn_test_accuracy_list.append(round(val_epoch_acc, 4))
end_time = time.process_time()
elapsed_time = end_time - start_time
print("time:", elapsed_time)
epoch 0 train loss 148.5192 train accuracy 0.2347 val loss 43.8903 val accuracy 0.4043 time: 362.094066 epoch 1 train loss 107.3851 train accuracy 0.535 val loss 31.9789 val accuracy 0.5936 time: 358.88564099999985 epoch 2 train loss 79.805 train accuracy 0.6766 val loss 24.9609 val accuracy 0.6913 time: 352.265895 epoch 3 train loss 64.3195 train accuracy 0.7461 val loss 21.1068 val accuracy 0.7252 time: 366.32607200000007 epoch 4 train loss 53.5912 train accuracy 0.7905 val loss 18.155 val accuracy 0.7612 time: 362.6128030000002 epoch 5 train loss 43.6401 train accuracy 0.8228 val loss 16.4422 val accuracy 0.7802 time: 366.38400500000034 epoch 6 train loss 39.1445 train accuracy 0.8592 val loss 14.9904 val accuracy 0.8005 time: 356.82478200000014 epoch 7 train loss 32.4194 train accuracy 0.8823 val loss 14.2734 val accuracy 0.8094 time: 363.46790199999987 epoch 8 train loss 28.7035 train accuracy 0.9016 val loss 13.5009 val accuracy 0.8202 time: 358.15388299999995 epoch 9 train loss 24.9321 train accuracy 0.9145 val loss 12.9508 val accuracy 0.8209 time: 362.11623599999984 epoch 10 train loss 20.6999 train accuracy 0.9335 val loss 12.581 val accuracy 0.8263 time: 363.47333400000025 epoch 11 train loss 18.2341 train accuracy 0.9443 val loss 12.122 val accuracy 0.8236 time: 368.4358520000005 epoch 12 train loss 16.1649 train accuracy 0.9525 val loss 11.8146 val accuracy 0.8412 time: 360.1432559999994 epoch 13 train loss 14.2326 train accuracy 0.9613 val loss 11.598 val accuracy 0.8412 time: 374.934569 epoch 14 train loss 12.0825 train accuracy 0.969 val loss 11.3908 val accuracy 0.8474 time: 360.7733240000007 epoch 15 train loss 10.4957 train accuracy 0.9737 val loss 11.2721 val accuracy 0.8487 time: 366.32139500000085 epoch 16 train loss 9.0818 train accuracy 0.981 val loss 11.2058 val accuracy 0.8548 time: 362.1448909999999 epoch 17 train loss 8.1702 train accuracy 0.981 val loss 11.1899 val accuracy 0.8535 time: 361.6823430000004 epoch 18 train loss 7.3026 train accuracy 0.9844 val loss 11.198 val accuracy 0.8507 time: 359.6158379999997 epoch 19 train loss 6.7623 train accuracy 0.9869 val loss 11.2555 val accuracy 0.8507 time: 368.5266160000001
In [18]:
import matplotlib.pyplot as plt
plt.plot(range(maxepoch), cnn_train_accuracy_list)
plt.plot(range(maxepoch), cnn_test_accuracy_list)
plt.legend(["Train_accuracy", "Test_accuracy"])
plt.xlabel("#epoch")
plt.ylabel("Accuracy")
plt.show()
●2つのモデルの精度比較
In [19]:
import matplotlib.pyplot as plt
plt.plot(range(maxepoch), lstm_test_accuracy_list)
plt.plot(range(maxepoch), cnn_test_accuracy_list)
plt.legend(["LSTM", "CNN"])
plt.xlabel("#epoch")
plt.ylabel("Accuracy")
plt.show()
結果: 20epochが完了し、 LSTM: 0.66, CNN: 0.85の精度となった。そのため、今回作成したモデルではCNNの方が良い精度となった。
考察:
- 両モデルの学習データで、LSTM用のdatasets["title"]とCNN用のnews_df["body"]を比較すると、CNNの方がデータが大きい。その分学習時間を要するが、より深い学習ができたのではないかと考える
- 両モデルについて、サイトを参考に作成し、ハイパーパラメータのチューニングは特に行っていないため、チューニングすることで精度の向上を図ることができると考える。 LSTMの方が、調整できるパラメータが多いので、今回の精度0.66から伸び代は大きくあると考えており、特に単語の埋め込み次元をを増やすことでベクトルでの表現力が向上し精度もアップすると考える。
- 作成したモデルは、dropout等の正則化は利用していない。そのため、正則化することで精度の向上を期待できる。 </ul> 感想: LSTMよりCNNの方が、コードを理解しやすく、精度も高かった。学習時間はCNNの方が多く要したが、GPUを利用すれば問題ないように思えた。そのため、初心者が同様のタスクを行おうとするならば、CNNを選んだ方が良いように思った。
In [20]:
datasets.head() # 参考 datasetsの中身
Out[20]:
| title | category | |
|---|---|---|
| 0 | 【DVDエンター!】誘拐犯に育てられた女が目にした真実は、孤独か幸福か\n | movie-enter |
| 1 | 藤原竜也、中学生とともにロケット打ち上げに成功\n | movie-enter |
| 2 | 『戦火の馬』ロイヤル・プレミアにウィリアム王子&キャサリン妃が出席\n | movie-enter |
| 3 | 香里奈、女子高生100人のガチンコ質問に回答「ラーメンも食べる」\n | movie-enter |
| 4 | ユージの前に立ちはだかったJOY「僕はAKBの高橋みなみを守る」\n | movie-enter |
In [21]:
news_df.head() #参考 news_dfの中身
Out[21]:
| body | category | category_id | |
|---|---|---|---|
| 0 | 【DVDエンター!】誘拐犯に育てられた女が目にした真実は、孤独か幸福か2005年11月から翌... | movie-enter | 0 |
| 1 | 藤原竜也、中学生とともにロケット打ち上げに成功「アンテナを張りながら生活をしていけばいい」2... | movie-enter | 0 |
| 2 | 『戦火の馬』ロイヤル・プレミアにウィリアム王子&キャサリン妃が出席3月2日より全国ロードショ... | movie-enter | 0 |
| 3 | 香里奈、女子高生100人のガチンコ質問に回答「ラーメンも食べる」女優の香里奈が18日、都内で... | movie-enter | 0 |
| 4 | ユージの前に立ちはだかったJOY「僕はAKBの高橋みなみを守る」5日、東京・千代田区の内幸町... | movie-enter | 0 |
In [ ]:


コメント・お問合せ
以下のツイートの『返信』にてお願いいたします