Keras (29): LSTM Text Classification
I. Load the movie review data from the keras imdb dataset
1. Load the data
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
from tensorflow import keras

imdb = keras.datasets.imdb
vocab_size = 10000    # keep only the 10,000 most frequent words; everything else maps to a special token
index_from = 3        # shift every word index up by 3 to leave room for the special tokens
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(
    num_words = vocab_size, index_from = index_from)
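A quick look at what load_data returns: each review is already a list of word ids and each label is 0 (negative) or 1 (positive).
print(len(train_data), len(test_data))    # 25000 reviews in each split
print(train_data[0][:10])                 # the first ids of review 0; id 1 marks the start of the review
print(train_labels[0])                    # 0 = negative, 1 = positive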
2. Build the id-to-word dictionary
word_index = imdb.get_word_index()
print(len(word_index))
# print(word_index)
# shift every id by 3 so that ids 0-3 are free for the special tokens below
word_index = {k: (v + 3) for k, v in word_index.items()}
3. Add the special tokens to the dictionary
word_index['<PAD>'] = 0      # padding token
word_index['<START>'] = 1    # start-of-review token
word_index['<UNK>'] = 2      # unknown / out-of-vocabulary token
word_index['<END>'] = 3      # end-of-review token
reverse_word_index = dict(
    [(value, key) for key, value in word_index.items()])
4. Translate ids back to words
def decode_review(text_ids):
    return ' '.join(
        [reverse_word_index.get(word_id, "<UNK>") for word_id in text_ids])
decode_review(train_data[0])
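A quick sanity check of the offset: after the shift, id 1 should decode to <START>, so the decoded review begins with that marker followed by the original text.
assert reverse_word_index[1] == '<START>'
print(decode_review(train_data[0][:8]))    # the <START> marker plus the first few words of review 0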
II. Pad the sequences to a fixed length
max_length = 500
train_data = keras.preprocessing.sequence.pad_sequences(
    train_data,                    # list of lists
    value = word_index['<PAD>'],
    padding = 'post',              # 'post' pads at the end, 'pre' at the beginning
    maxlen = max_length)
test_data = keras.preprocessing.sequence.pad_sequences(
    test_data,                    # list of lists
    value = word_index['<PAD>'],
    padding = 'post',
    maxlen = max_length)
print(train_data[0])
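After padding, both splits are dense integer matrices: reviews longer than 500 tokens are truncated and shorter ones are filled with the <PAD> id 0 at the end.
print(train_data.shape, test_data.shape)    # expect (25000, 500) (25000, 500)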
III. Build the model
1. A single-layer unidirectional LSTM model (single_rnn_model)
# embedding_dim = 16
# batch_size = 512
# model = keras.models.Sequential([
#     # 1. define matrix: [vocab_size, embedding_dim]
#     # 2. each sample [1, 2, 3, 4, ...] -> max_length * embedding_dim
#     # 3. a batch -> batch_size * max_length * embedding_dim
#     keras.layers.Embedding(vocab_size, embedding_dim,
#                            input_length = max_length),
#     keras.layers.LSTM(units = 64, return_sequences = False),
#     keras.layers.Dense(64, activation = 'relu'),
#     keras.layers.Dense(1, activation = 'sigmoid'),
# ])
# model.summary()
# model.compile(optimizer = 'adam',
#               loss = 'binary_crossentropy',
#               metrics = ['accuracy'])
2. A two-layer bidirectional LSTM model (double_rnn_model)
embedding_dim = 16
batch_size = 512
model = keras.models.Sequential([
    # 1. define matrix: [vocab_size, embedding_dim]
    # 2. each sample [1, 2, 3, 4, ...] -> max_length * embedding_dim
    # 3. a batch -> batch_size * max_length * embedding_dim
    keras.layers.Embedding(vocab_size, embedding_dim,
                           input_length = max_length),
    # the first recurrent layer returns the full sequence so the next one can consume it
    keras.layers.Bidirectional(
        keras.layers.LSTM(units = 64, return_sequences = True)),
    # the second recurrent layer returns only its final output
    keras.layers.Bidirectional(
        keras.layers.LSTM(units = 64, return_sequences = False)),
    keras.layers.Dense(64, activation = 'relu'),
    keras.layers.Dense(1, activation = 'sigmoid'),
])
model.summary()
model.compile(optimizer = 'adam',
              loss = 'binary_crossentropy',
              metrics = ['accuracy'])
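The parameter counts printed by model.summary() can be reproduced by hand: a standard LSTM layer has 4 * (units * (input_dim + units) + units) weights (four gates, each with a kernel, a recurrent kernel, and a bias), and a Bidirectional wrapper doubles that. A small check of the arithmetic (my own sketch, not part of the original):
def lstm_params(input_dim, units):
    # four gates, each with kernel, recurrent kernel, and bias
    return 4 * (units * (input_dim + units) + units)

print(vocab_size * embedding_dim)    # Embedding: 160000
print(2 * lstm_params(16, 64))       # Bidirectional LSTM 1: 41472
print(2 * lstm_params(128, 64))      # Bidirectional LSTM 2: 98816 (input is the 2 * 64 concatenation)
print(128 * 64 + 64)                 # Dense(64): 8256
print(64 * 1 + 1)                    # Dense(1): 65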
IV. Train the model
history = model.fit(
    train_data, train_labels,
    epochs = 30,
    batch_size = batch_size,
    validation_split = 0.2)
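With 30 epochs on 20,000 training samples the model will typically start overfitting long before the end; an optional variant (my addition, not in the original) stops training once the validation loss stops improving:
callbacks = [keras.callbacks.EarlyStopping(
    monitor = 'val_loss', patience = 3, restore_best_weights = True)]
history = model.fit(
    train_data, train_labels,
    epochs = 30,
    batch_size = batch_size,
    validation_split = 0.2,
    callbacks = callbacks)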
V. Plot the training curves
def plot_learning_curves(history, label, epochs, min_value, max_value):
    data = {}
    data[label] = history.history[label]
    data['val_' + label] = history.history['val_' + label]
    pd.DataFrame(data).plot(figsize = (8, 5))
    plt.grid(True)
    plt.axis([0, epochs, min_value, max_value])
    plt.show()
plot_learning_curves(history, 'accuracy', 30, 0, 1)
plot_learning_curves(history, 'loss', 30, 0, 1)
VI. Evaluate on the test set
res_test = model.evaluate(
    test_data, test_labels,
    batch_size = batch_size,
    verbose = 0)
print(res_test)
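evaluate returns one value per compiled metric, here [loss, accuracy], so the result can be unpacked directly:
test_loss, test_acc = res_test
print('test loss: %.4f, test accuracy: %.4f' % (test_loss, test_acc))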
VII. Complete code
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 4 10:52:02 2021
@author: nijiahui
"""
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf
from tensorflow import keras
print(tf.__version__)
print(sys.version_info)
for module in mpl, np, pd, sklearn, tf, keras:
    print(module.__name__, module.__version__)
# I. Process the data
# 1. Load the data
imdb = keras.datasets.imdb
vocab_size = 10000    # keep only the 10,000 most frequent words
index_from = 3        # shift word indices up by 3 for the special tokens
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(
    num_words = vocab_size, index_from = index_from)
# 2. Build the id-to-word dictionary
word_index = imdb.get_word_index()
print(len(word_index))
# print(word_index)
word_index = {k:(v+3) for k, v in word_index.items()}
# 3. Add the special tokens to the dictionary
word_index['<PAD>'] = 0
word_index['<START>'] = 1
word_index['<UNK>'] = 2
word_index['<END>'] = 3
reverse_word_index = dict(
    [(value, key) for key, value in word_index.items()])
# 4. Translate ids back to words
def decode_review(text_ids):
    return ' '.join(
        [reverse_word_index.get(word_id, "<UNK>") for word_id in text_ids])
decode_review(train_data[0])
# II. Pad the sequences to a fixed length
max_length = 500
train_data = keras.preprocessing.sequence.pad_sequences(
    train_data,                    # list of lists
    value = word_index['<PAD>'],
    padding = 'post',              # 'post' pads at the end, 'pre' at the beginning
    maxlen = max_length)
test_data = keras.preprocessing.sequence.pad_sequences(
    test_data,                    # list of lists
    value = word_index['<PAD>'],
    padding = 'post',
    maxlen = max_length)
print(train_data[0])
# III. Build the model
# 1. A single-layer unidirectional LSTM model (single_rnn_model)
# embedding_dim = 16
# batch_size = 512
# model = keras.models.Sequential([
#     # 1. define matrix: [vocab_size, embedding_dim]
#     # 2. each sample [1, 2, 3, 4, ...] -> max_length * embedding_dim
#     # 3. a batch -> batch_size * max_length * embedding_dim
#     keras.layers.Embedding(vocab_size, embedding_dim,
#                            input_length = max_length),
#     keras.layers.LSTM(units = 64, return_sequences = False),
#     keras.layers.Dense(64, activation = 'relu'),
#     keras.layers.Dense(1, activation = 'sigmoid'),
# ])
# model.summary()
# model.compile(optimizer = 'adam',
#               loss = 'binary_crossentropy',
#               metrics = ['accuracy'])
# 2. A two-layer bidirectional LSTM model (double_rnn_model)
embedding_dim = 16
batch_size = 512
model = keras.models.Sequential([
    # 1. define matrix: [vocab_size, embedding_dim]
    # 2. each sample [1, 2, 3, 4, ...] -> max_length * embedding_dim
    # 3. a batch -> batch_size * max_length * embedding_dim
    keras.layers.Embedding(vocab_size, embedding_dim,
                           input_length = max_length),
    keras.layers.Bidirectional(
        keras.layers.LSTM(units = 64, return_sequences = True)),
    keras.layers.Bidirectional(
        keras.layers.LSTM(units = 64, return_sequences = False)),
    keras.layers.Dense(64, activation = 'relu'),
    keras.layers.Dense(1, activation = 'sigmoid'),
])
model.summary()
model.compile(optimizer = 'adam',
              loss = 'binary_crossentropy',
              metrics = ['accuracy'])
# IV. Train the model
history = model.fit(
    train_data, train_labels,
    epochs = 30,
    batch_size = batch_size,
    validation_split = 0.2)
# V. Plot the training curves
def plot_learning_curves(history, label, epochs, min_value, max_value):
    data = {}
    data[label] = history.history[label]
    data['val_' + label] = history.history['val_' + label]
    pd.DataFrame(data).plot(figsize = (8, 5))
    plt.grid(True)
    plt.axis([0, epochs, min_value, max_value])
    plt.show()
plot_learning_curves(history, 'accuracy', 30, 0, 1)
plot_learning_curves(history, 'loss', 30, 0, 1)
# VI. Evaluate on the test set
res_test = model.evaluate(
    test_data, test_labels,
    batch_size = batch_size,
    verbose = 0)
print(res_test)