# word2vec + gradient-boosted text classification — computer-science capstone project
# (scraped article title deduplicated and commented out so the file is valid Python)
import pandas as pd
import numpy as np
from collections import Counter
import re
import jieba
from tqdm import tqdm
from sklearn.metrics import roc_curve, auc
import joblib
import gensim
from sklearn.svm import SVC
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
# Matplotlib global configuration for Chinese text.
plt.rcParams["font.sans-serif"] = ['Simhei']  # SimHei font so Chinese labels render instead of boxes
plt.rcParams["axes.unicode_minus"] = False  # keep minus signs rendering correctly with a CJK font active
# gensim==4.2.0
def data_process(csv_path="nCoV_100k_train.labled.csv",
                 per_class_limit=11800, total_limit=35000):
    """Load the labelled Weibo sentiment CSV and build a balanced sample.

    Keeps up to ``per_class_limit`` texts per raw sentiment class (-1/0/1)
    whose content contains more than 20 Chinese characters, stopping once
    ``total_limit`` texts have been collected overall.  Labels are shifted
    by +1 so the returned labels are 0/1/2 (no negative class ids, as the
    downstream classifier requires).

    Args:
        csv_path: path to the labelled CSV (default matches the original script).
        per_class_limit: maximum samples kept per class.
        total_limit: maximum samples kept overall.

    Returns:
        (texts, labels): parallel lists — raw texts and int labels in
        {0, 1, 2}, grouped by class (all label-0 texts, then 1, then -1).
    """
    data = pd.read_csv(csv_path)
    data_source = data['微博中文内容'].values
    data_label = data['情感倾向'].values
    # One bucket per raw label string; replaces three copy-pasted branches.
    texts = {'0': [], '1': [], '-1': []}
    labels = {'0': [], '1': [], '-1': []}
    kept = 0  # total samples kept across all classes
    han_re = re.compile(u'[\u4e00-\u9fa5]')  # hoisted: matches one Chinese character
    for idx, line in enumerate(data_source):
        key = str(data_label[idx])
        # Skip unknown labels and classes that already hit their cap.
        if key not in texts or len(texts[key]) >= per_class_limit:
            continue
        # Require more than 20 Chinese characters to drop trivial posts.
        if len(han_re.findall(str(line))) > 20:
            kept += 1
            texts[key].append(line)
            labels[key].append(int(data_label[idx]) + 1)  # shift -1/0/1 -> 0/1/2
        if kept == total_limit:
            break
    train_text_data = texts['0'] + texts['1'] + texts['-1']
    train_text_data_label = labels['0'] + labels['1'] + labels['-1']
    print(Counter(train_text_data_label))
    return train_text_data, train_text_data_label
train_text_data, train_text_data_label = data_process()
# --- word2vec corpus training: only needs to run once (gensim==4.2.0) ---
# BUG FIX: pass the corpus *path* to LineSentence instead of an open file
# object.  Word2Vec iterates the corpus several times (vocabulary scan plus
# one pass per training epoch); an already-open file object is exhausted
# after the first pass (and was never closed), whereas LineSentence reopens
# a path for every iteration.
model = Word2Vec(LineSentence('word2vec_txt.txt'), sg=0, vector_size=64, window=3, min_count=3, workers=4)
# Persist the trained model so later runs can reload instead of retraining.
model.save('test.model')
# Reload the word vectors through the saved model (recommended).
model_vec = gensim.models.Word2Vec.load('test.model')
dic = model_vec.wv.index_to_key  # learned vocabulary
#print(dic)
#print(len(dic))
# 80/20 train/test split of the raw texts before vectorisation.
x_train, x_test, y_train, y_test = train_test_split(train_text_data, train_text_data_label, test_size=0.2)
# Sentence-vector / label buffers filled by the vectorisation loops below.
train_vec = []
train_vec_label = []
test_vec = []
test_vec_label = []
def _mean_word_vec(text, wv, dim=64):
    """Average the word2vec vectors of all in-vocabulary jieba tokens of *text*.

    Args:
        text: raw sentence to vectorise.
        wv: gensim KeyedVectors holding the trained word vectors.
        dim: vector dimensionality (matches vector_size=64 above).

    Returns:
        A (dim,) numpy array; the zero vector when no token is in vocabulary.
    """
    vec = np.zeros(dim)
    count = 0
    for word in jieba.cut(text, cut_all=False):
        # Membership check instead of try/except KeyError: same skip-OOV
        # behaviour without raising per unknown word.
        if word in wv:
            vec += wv[word]
            count += 1
    if count != 0:
        vec /= count
    return vec

# Build averaged sentence vectors for the train and test splits.
# (The two loops previously here were copy-pasted duplicates.)
for idx, line in enumerate(x_train):
    train_vec.append(_mean_word_vec(line, model.wv))
    train_vec_label.append(y_train[idx])
for idx, line in enumerate(x_test):
    test_vec.append(_mean_word_vec(line, model.wv))
    test_vec_label.append(y_test[idx])
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBRegressor, XGBClassifier
from lightgbm import LGBMClassifier
# Fit a LightGBM classifier on the averaged sentence vectors.
clf = LGBMClassifier()
# CONSISTENCY FIX: train and evaluate against the label lists that were built
# in lock-step with the feature lists (train_vec_label / test_vec_label), so
# the feature/label pairing stays correct even if the vectorisation step ever
# skips a sample.  (With the current loops the values are identical to
# y_train / y_test.)
clf.fit(train_vec, train_vec_label)
# NOTE(review): the dump filename says GaussianNB but the model is LightGBM —
# path kept as-is so existing loaders keep working; consider renaming.
joblib.dump(clf, '../GaussianNB.pkl', compress=3)
y_pred = clf.predict(test_vec)
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score, precision_score, recall_score
# Macro averaging weights the three sentiment classes equally.
accuracy = accuracy_score(test_vec_label, y_pred)
precision = precision_score(test_vec_label, y_pred, average='macro')
recall = recall_score(test_vec_label, y_pred, average='macro')
f1 = f1_score(test_vec_label, y_pred, average='macro')
print(f'Accuracy: {accuracy}')
print(f'precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
import seaborn as sn #画图模块
from sklearn.metrics import confusion_matrix
def plot_matrix(y_true, y_pred, title_name):
    """Plot the confusion matrix of *y_pred* against *y_true* as a heatmap.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted labels.
        title_name: title shown above the heatmap.
    """
    cm = confusion_matrix(y_true, y_pred)  # rows = true classes, columns = predictions
    # annot=True prints the count in each cell; fmt='g' avoids scientific notation.
    ax = sn.heatmap(cm, annot=True, fmt='g', xticklabels=['1', '2', '3'], yticklabels=['1', '2', '3'])
    ax.set_title(title_name)  # title
    ax.set_xlabel('predict')  # x axis: predicted class
    ax.set_ylabel('true')  # y axis: true class
    plt.show()
# BUG FIX: the original call passed (y_pred, y_test) into (y_true, y_pred),
# swapping the axes so the heatmap's "true"/"predict" labels were reversed.
plot = plot_matrix(list(y_test), list(y_pred), 'example-confusion matrix')
# from sklearn.metrics import classification_report
# from sklearn.metrics import confusion_matrix
# import matplotlib.pyplot as plt
# from sklearn.metrics import accuracy_score
# from sklearn.metrics import f1_score
# import seaborn as sn #画图模块
# from sklearn.metrics import confusion_matrix
# def plot_matrix(y_true, y_pred, title_name):
# cm = confusion_matrix(y_true, y_pred) # 混淆矩阵
# # annot = True 格上显示数字 ,fmt:显示数字的格式控制
# ax = sn.heatmap(cm, annot=True, fmt='g', xticklabels=['1', '2', '3'], yticklabels=['1', '2', '3'])
# # xticklabels、yticklabels指定横纵轴标签
# ax.set_title(title_name) # 标题
# ax.set_xlabel('predict') # x轴
# ax.set_ylabel('true') # y轴
# plt.show()
#
# # 调用函数画图
# plot = plot_matrix(list(y_pred), list(y_true), 'example-confusion matrix')
# (scraped page footer removed: "更多推荐" / "所有评论(0)" — commented out so the file parses)