Natural Language Processing (NLP) (Part 2, Hands-On): News Topic Classification Task
The dataset contains the following: AG_NEWS consists of news articles in four topic classes (World, Sports, Business, Sci/Tech), with 120,000 training samples and 7,600 test samples (train.csv and test.csv).

Data preparation:
import torch
import torchtext
from torchtext.datasets import text_classification
import os
# Directory where the dataset will be downloaded
load_data_path = "./data"
if not os.path.isdir(load_data_path):
    os.mkdir(load_data_path)
# Download the AG_NEWS dataset (torchtext 0.4/0.5 API)
train_dataset, test_dataset = text_classification.DATASETS["AG_NEWS"](root=load_data_path)
# Newer torchtext versions removed the text_classification module; use instead:
# train_dataset, test_dataset = torchtext.datasets.AG_NEWS(root=load_data_path)
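Before building the model, it helps to check what a single sample looks like. The quick check below assumes the torchtext 0.4/0.5 text_classification API used above, where each sample is a (label, token-id tensor) tuple:
# Peek at one training sample: a (label, token-id tensor) pair
label, token_ids = train_dataset[0]
print(label)        # an integer class id in [0, 3]
print(token_ids)    # a 1-D LongTensor of token ids from the dataset vocab
print(len(train_dataset.get_vocab()))  # vocabulary size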
1. Building a Text Classification Model with an Embedding Layer
import torch.nn as nn
import torch.nn.functional as F
BATCH_SIZE = 16
# Prefer the GPU if one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class TextSentiment(nn.Module):
    '''Text classification model'''
    def __init__(self, vocab_size, embed_dim, num_class):
        '''
        description: class initializer
        :param vocab_size: total number of distinct words in the corpus
        :param embed_dim: dimensionality of the word embeddings
        :param num_class: number of target classes
        '''
        super().__init__()
        # Instantiate the embedding layer; sparse=True means that on each
        # backward pass only the rows actually used receive gradient updates
        self.embedding = nn.Embedding(vocab_size, embed_dim, sparse=True)
        # Instantiate the linear layer mapping embed_dim to num_class
        self.fc = nn.Linear(embed_dim, num_class)
        # Initialize the weights of each layer
        self.init_weights()

    def init_weights(self):
        '''Weight initialization'''
        # Range for the uniform initialization
        initrange = 0.5
        # Initialize each layer's weights from a uniform distribution
        # (note the trailing underscore: uniform_ is the in-place version)
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        # Initialize the bias to zero
        self.fc.bias.data.zero_()
    def forward(self, text):
        '''
        :param text: the batch's token ids, concatenated into one tensor
        :return: a tensor with num_class columns, used to predict the class
        '''
        # Look up the embeddings; the result has shape (m, 32),
        # where m is the total number of tokens in the batch
        embedded = self.embedding(text)
        # Next, reduce (m, 32) to (BATCH_SIZE, 32) so the fc layer
        # can produce one prediction (and one loss term) per sample
        # m contains c full windows of size BATCH_SIZE
        c = embedded.size(0) // BATCH_SIZE
        embedded = embedded[:BATCH_SIZE * c]
        # Transpose and add a leading dimension to get a 3-D tensor
        embedded = embedded.transpose(1, 0).unsqueeze(0)
        # Average-pool along the token dimension with window size c
        embedded = F.avg_pool1d(embedded, kernel_size=c)
        # Remove the added dimension and transpose back before the fc layer
        return self.fc(embedded[0].transpose(1, 0))
# Instantiate the model
VOCAB_SIZE = len(train_dataset.get_vocab())
EMBED_DIM = 32
NUM_CLASS = len(train_dataset.get_labels())
model = TextSentiment(VOCAB_SIZE, EMBED_DIM, NUM_CLASS).to(device)
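The pooling trick in forward() is easier to follow with concrete numbers. Below is a minimal, self-contained sketch (the sizes are made up for illustration only) of how an (m, 32) embedding matrix is reduced to (BATCH_SIZE, 32):
import torch
import torch.nn.functional as F

BATCH_SIZE = 16                        # same batch size as in the model above
m, embed_dim = 48, 32                  # toy sizes: 48 tokens across the batch
embedded = torch.randn(m, embed_dim)   # stand-in for the embedding output
c = embedded.size(0) // BATCH_SIZE     # here c = 3 tokens averaged per output row
embedded = embedded[:BATCH_SIZE * c]   # drop any leftover tokens
# (m, 32) -> (32, m) -> (1, 32, m); avg_pool1d pools along the last dim
pooled = F.avg_pool1d(embedded.transpose(1, 0).unsqueeze(0), kernel_size=c)
print(pooled[0].transpose(1, 0).shape) # torch.Size([16, 32])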
2. Batching the Data
def generate_batch(batch):
    """
    description: batch-generation (collate) function
    :param batch: a list of batch_size (label, sample tensor) tuples,
                  as yielded by the torchtext dataset
    :return: the samples concatenated into one tensor, and the labels
             as a tensor, like:
             text = tensor([sample1, sample2, ..., sampleN])
             label = tensor([label1, label2, ..., labelN])
    """
    # Extract the label tensor from the batch
    label = torch.tensor([entry[0] for entry in batch])
    # Concatenate all sample tensors into a single 1-D tensor
    text = [entry[1] for entry in batch]
    text = torch.cat(text)
    return text, label

# Example
batch = [(1, torch.tensor([3, 23, 2, 8])), (0, torch.tensor([3, 45, 21, 6]))]
res = generate_batch(batch)
print(res)
# (tensor([3, 23, 2, 8, 3, 45, 21, 6]), tensor([1, 0]))
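In the next section this function is passed to DataLoader as collate_fn. As a toy illustration (using the example batch above as a stand-in dataset), the two connect like this:
from torch.utils.data import DataLoader

# DataLoader calls generate_batch via collate_fn to assemble each batch
toy_data = [(1, torch.tensor([3, 23, 2, 8])), (0, torch.tensor([3, 45, 21, 6]))]
loader = DataLoader(toy_data, batch_size=2, collate_fn=generate_batch)
for text, label in loader:
    print(text)   # tensor([ 3, 23,  2,  8,  3, 45, 21,  6])
    print(label)  # tensor([1, 0])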
3. Building the Training and Validation Functions
from torch.utils.data import DataLoader
def train(train_data):
    train_loss = 0
    train_acc = 0
    data = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=generate_batch)
    # Iterate over the batches, updating the parameters on each one
    for i, (text, cls) in enumerate(data):
        # Move the batch to the same device as the model
        text, cls = text.to(device), cls.to(device)
        optimizer.zero_grad()
        output = model(text)
        loss = criterion(output, cls)
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
        train_acc += (output.argmax(1) == cls).sum().item()
    # Decay the learning rate once per epoch
    scheduler.step()
    return train_loss / len(train_data), train_acc / len(train_data)
def valid(valid_data):
    valid_loss = 0
    acc = 0
    data = DataLoader(valid_data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=generate_batch)
    # No gradients are needed during validation
    with torch.no_grad():
        for text, cls in data:
            text, cls = text.to(device), cls.to(device)
            output = model(text)
            loss = criterion(output, cls)
            valid_loss += loss.item()
            acc += (output.argmax(1) == cls).sum().item()
    return valid_loss / len(valid_data), acc / len(valid_data)
4. Training and Validating the Model
import time
from torch.utils.data.dataset import random_split

N_EPOCHS = 10
min_valid_loss = float("inf")
criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=4.0)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)
# Hold out 5% of the training set for validation
train_len = int(len(train_dataset) * 0.95)
sub_train_, sub_valid_ = random_split(train_dataset, [train_len, len(train_dataset) - train_len])
for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss, train_acc = train(sub_train_)
    valid_loss, valid_acc = valid(sub_valid_)
    secs = int(time.time() - start_time)
    mins = secs // 60
    secs = secs % 60
    print('Epoch: %d' % (epoch + 1), "| time in %d minutes, %d seconds" % (mins, secs))
    print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')
    print(f'\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)')

# Inspect the word vectors learned by the embedding layer
print(model.state_dict()["embedding.weight"])
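The test split downloaded at the start is never used above; as a final sketch, the valid() function defined earlier can be reused to evaluate on it:
# A minimal sketch: reuse valid() on the held-out AG_NEWS test split
test_loss, test_acc = valid(test_dataset)
print(f'Test - Loss: {test_loss:.4f} | Acc: {test_acc * 100:.1f}%')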