1、基于词袋模型的逻辑回归情感分类
# coding: utf-8
"""Sentiment classification with bag-of-words features and logistic regression."""
import functools
import itertools
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

# Location of the newline-separated English stopword list.
STOPWORDS_PATH = '../stopwords/stopwords_english.txt'


# ---------------------------------------------------------------------------
# Bag-of-words features
# ---------------------------------------------------------------------------
@functools.lru_cache(maxsize=None)
def _load_stopwords(path=STOPWORDS_PATH):
    """Read the stopword file once and return its entries as a frozenset.

    The result is cached so that cleaning thousands of reviews does not
    re-open and re-parse the file for every single review.

    :param path: path of a text file holding one stopword per line.
    :return: frozenset of stopwords with trailing whitespace stripped.
    """
    with open(path, encoding='utf-8') as handle:
        return frozenset(line.rstrip() for line in handle)


def clean_text(text):
    """Turn one raw review into a cleaned, space-joined token string.

    Steps: strip HTML tags, replace every character that is not an ASCII
    letter with a space, lowercase, split on whitespace, drop stopwords and
    re-join the remaining words into a new sentence.

    :param text: raw review text, possibly containing HTML markup.
    :return: the cleaned review as a single space-separated string.
    """
    text = BeautifulSoup(text, 'html.parser').get_text()
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    eng_stopwords = _load_stopwords()
    return ' '.join(w for w in text.lower().split() if w not in eng_stopwords)


def plot_confusion_matrix(cm, classes, title='Confusion matrix', cmap=plt.cm.Blues):
    """Draw a confusion matrix on the current matplotlib figure.

    :param cm: 2-D array of counts, indexed as ``cm[true, predicted]``.
    :param classes: tick labels, used for both axes.
    :param title: title placed above the plot.
    :param cmap: matplotlib colormap used for the cells.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)

    # Cells darker than half of the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


if __name__ == '__main__':
    # Load the labelled reviews.
    df = pd.read_csv('../data/labeledTrainData.tsv', sep='\t', escapechar='\\')
    print(df.head(5))

    # Clean every entry of the `review` column.
    df['clean_review'] = df.review.apply(clean_text)
    print(df['clean_review'])

    # Extract bag-of-words features with sklearn's CountVectorizer.
    vectorizer = CountVectorizer(max_features=5000)
    train_data_features = vectorizer.fit_transform(df.clean_review).toarray()
    print(train_data_features)

    # Split into training and test sets.
    X_train, X_test, y_train, y_test = train_test_split(
        train_data_features, df.sentiment, test_size=0.2, random_state=0)
    print(X_train, X_test, y_train, y_test)

    # Train the classifier and predict on the held-out set.
    LR_model = LogisticRegression()
    LR_model = LR_model.fit(X_train, y_train)
    y_pred = LR_model.predict(X_test)
    cnf_matrix = confusion_matrix(y_test, y_pred)

    print("Recall metric in the testing dataset: ",
          cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1]))
    print("accuracy metric in the testing dataset: ",
          (cnf_matrix[1, 1] + cnf_matrix[0, 0]) / (
              cnf_matrix[0, 0] + cnf_matrix[1, 1] + cnf_matrix[1, 0] + cnf_matrix[0, 1]))

    # Plot the non-normalized confusion matrix.
    class_names = [0, 1]
    plt.figure()
    plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix')
    plt.show()
2、基于word2vec词向量模型的逻辑回归情感分类
"""Sentiment classification with averaged word2vec features and logistic regression."""
import functools
import itertools
import re
import warnings

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from gensim.models.word2vec import Word2Vec
from nltk.corpus import stopwords
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

warnings.filterwarnings("ignore")

# Dimensionality of the word vectors; shared by training and featurisation.
NUM_FEATURES = 300


@functools.lru_cache(maxsize=None)
def _english_stopwords():
    """Return NLTK's English stopword list as a frozenset, loaded only once."""
    return frozenset(stopwords.words('english'))


def clean_text(text, remove_stopwords=False):
    """Tokenise a piece of raw text into lowercase alphabetic words.

    Strips HTML tags, replaces every character that is not an ASCII letter
    with a space, lowercases and splits on whitespace.

    :param text: raw text, possibly containing HTML markup.
    :param remove_stopwords: when true, English stopwords are dropped.
    :return: list of word strings.
    """
    text = BeautifulSoup(text, 'html.parser').get_text()
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    words = text.lower().split()
    if remove_stopwords:
        # The stopword list is fetched lazily so it is only needed when used.
        eng_stopwords = _english_stopwords()
        words = [w for w in words if w not in eng_stopwords]
    return words


def split_sentences(review):
    """Split one raw review into sentences, each cleaned into a word list.

    Relies on the module-level ``tokenizer`` (an NLTK sentence tokenizer)
    created in the ``__main__`` section.

    :param review: raw review text; it must still contain its punctuation,
        otherwise sentence boundaries cannot be detected.
    :return: list of sentences, each a list of words.
    """
    raw_sentences = tokenizer.tokenize(str(review).strip())
    return [clean_text(s) for s in raw_sentences if s]


def to_review_vector(review):
    """Represent a review as the mean of its in-vocabulary word vectors.

    Relies on the module-level ``model`` (a trained Word2Vec model) created
    in the ``__main__`` section.

    :param review: raw review text.
    :return: ``pd.Series`` of length ``NUM_FEATURES``; all zeros when none
        of the review's words are in the model vocabulary.
    """
    words = clean_text(review, remove_stopwords=True)
    keyed_vectors = model.wv
    vectors = [keyed_vectors[w] for w in words if w in keyed_vectors]
    if not vectors:
        return pd.Series(np.zeros(NUM_FEATURES))
    # Average (not sum) so the feature scale does not grow with review length.
    return pd.Series(np.mean(vectors, axis=0))


def plot_confusion_matrix(cm, classes, title='Confusion matrix', cmap=plt.cm.Blues):
    """Draw a confusion matrix on the current matplotlib figure.

    :param cm: 2-D array of counts, indexed as ``cm[true, predicted]``.
    :param classes: tick labels, used for both axes.
    :param title: title placed above the plot.
    :param cmap: matplotlib colormap used for the cells.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)

    # Cells darker than half of the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


if __name__ == '__main__':
    # Load the labelled reviews.
    df = pd.read_csv('../data/labeledTrainData.tsv', sep='\t', escapechar='\\')

    # Sentence-split the RAW reviews with NLTK, then clean each sentence into
    # a word list. Splitting must happen before cleaning because cleaning
    # removes the punctuation the sentence tokenizer depends on.
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    sentences_list = list(
        itertools.chain.from_iterable(df.review.apply(split_sentences)))

    # word2vec hyper-parameters
    num_features = NUM_FEATURES  # Word vector dimensionality
    min_word_count = 40          # Minimum word count
    num_workers = 4              # Number of threads to run in parallel
    context = 10                 # Context window size

    # NOTE: gensim >= 4.0 renamed the `size` keyword to `vector_size`; adjust
    # this call if the installed gensim rejects `size`.
    model = Word2Vec(sentences_list, workers=num_workers, size=num_features,
                     min_count=min_word_count, window=context)
    model.init_sims(replace=True)
    model.save('word2vec.models')

    # One averaged word vector per review.
    train_data_features = df.review.apply(to_review_vector)
    X_train, X_test, y_train, y_test = train_test_split(
        train_data_features, df.sentiment, test_size=0.2, random_state=0)

    # Train the classifier and predict on the held-out set.
    LR_model = LogisticRegression()
    LR_model = LR_model.fit(X_train, y_train)
    y_pred = LR_model.predict(X_test)
    cnf_matrix = confusion_matrix(y_test, y_pred)

    print("Recall metric in the testing dataset: ",
          cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1]))
    print("accuracy metric in the testing dataset: ",
          (cnf_matrix[1, 1] + cnf_matrix[0, 0]) / (
              cnf_matrix[0, 0] + cnf_matrix[1, 1] + cnf_matrix[1, 0] + cnf_matrix[0, 1]))

    # Plot the non-normalized confusion matrix.
    class_names = [0, 1]
    plt.figure()
    plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix')
    plt.show()