NLP之电影评分数据的情感分析-白红宇

NLP之电影评分数据的情感分析

阅读量：4488 次

发布时间：2019-06-08

本文共 6953 字，大约阅读时间需要 23 分钟。

1、基于词袋模型的逻辑回归情感分类

# coding: utf-8
"""Movie-review sentiment classification: bag-of-words features + logistic regression."""
import functools
import itertools
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

# Plain-text stop-word list, one word per line (path is relative to the working directory).
STOPWORDS_PATH = '../stopwords/stopwords_english.txt'


@functools.lru_cache(maxsize=None)
def _load_stopwords(path=STOPWORDS_PATH):
    """Read the stop-word file once and return its lines as a frozenset.

    The result is cached so that cleaning many reviews does not re-open
    and re-parse the file for every single review.
    """
    with open(path, encoding='utf-8') as handle:
        return frozenset(line.rstrip() for line in handle)


########################## bag-of-words features ##########################
def clean_text(text):
    """Normalise one raw review into a space-separated string of tokens.

    Steps: strip HTML tags, replace every non-letter character with a
    space, lower-case, split on whitespace, drop stop words, and join the
    remaining tokens back into a single string.

    :param text: raw review text, possibly containing HTML markup
    :return: the cleaned review as one string of tokens separated by spaces
    """
    text = BeautifulSoup(text, 'html.parser').get_text()
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    eng_stopwords = _load_stopwords()
    return ' '.join(w for w in text.lower().split() if w not in eng_stopwords)


# Confusion matrix
def plot_confusion_matrix(cm, classes,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map on the current figure.

    :param cm: 2-D array of counts; rows are true labels, columns are
        predicted labels
    :param classes: tick labels, used for both axes
    :param title: figure title
    :param cmap: matplotlib colormap used for the heat map
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)

    # Cells darker than half of the maximum get white text so the count stays readable.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


if __name__ == '__main__':
    # Load the labelled reviews.
    df = pd.read_csv('../data/labeledTrainData.tsv', sep='\t', escapechar='\\')
    print(df.head(5))

    # Clean every review in the `review` column.
    df['clean_review'] = df.review.apply(clean_text)
    print(df['clean_review'])

    # Extract bag-of-words features with sklearn's CountVectorizer.
    vectorizer = CountVectorizer(max_features=5000)
    train_data_features = vectorizer.fit_transform(df.clean_review).toarray()
    print(train_data_features)

    # Train / test split.
    X_train, X_test, y_train, y_test = train_test_split(train_data_features, df.sentiment, test_size=0.2,
                                                        random_state=0)
    print(X_train, X_test, y_train, y_test)

    # Train the classifier.
    LR_model = LogisticRegression()
    LR_model = LR_model.fit(X_train, y_train)
    y_pred = LR_model.predict(X_test)
    cnf_matrix = confusion_matrix(y_test, y_pred)

    # Recall = TP / (FN + TP); accuracy = (TP + TN) / all samples.
    print("Recall metric in the testing dataset: ", cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1]))
    print("accuracy metric in the testing dataset: ", (cnf_matrix[1, 1] + cnf_matrix[0, 0]) / (
                cnf_matrix[0, 0] + cnf_matrix[1, 1] + cnf_matrix[1, 0] + cnf_matrix[0, 1]))

    # Plot non-normalized confusion matrix
    class_names = [0, 1]
    plt.figure()
    plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix')
    plt.show()

2、基于word2vec词向量模型的逻辑回归情感分类

"""Movie-review sentiment classification: averaged word2vec vectors + logistic regression."""
import functools
import itertools
import re
import warnings

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from gensim.models.word2vec import Word2Vec
from nltk.corpus import stopwords
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

warnings.filterwarnings("ignore")


@functools.lru_cache(maxsize=None)
def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, built once and cached."""
    return frozenset(stopwords.words('english'))


def clean_text(text, remove_stopwords=False):
    """Tokenise a piece of raw review text.

    Strips HTML tags, replaces every non-letter character with a space,
    lower-cases and splits on whitespace.

    :param text: raw text, possibly containing HTML markup
    :param remove_stopwords: when true, drop NLTK English stop words
    :return: list of lower-case word tokens
    """
    text = BeautifulSoup(text, 'html.parser').get_text()
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    words = text.lower().split()
    if remove_stopwords:
        eng_stopwords = _english_stopwords()
        words = [w for w in words if w not in eng_stopwords]
    return words


def split_sentences(review):
    """Split one raw review into sentences and tokenise each sentence.

    Relies on the module-level ``tokenizer`` (the punkt sentence tokenizer
    loaded in the ``__main__`` section). The review must still be raw text:
    sentence boundaries are found from punctuation, which ``clean_text``
    removes.

    :param review: raw review text
    :return: list of sentences, each a non-empty list of word tokens
    """
    raw_sentences = tokenizer.tokenize(str(review).strip())
    tokenised = (clean_text(s) for s in raw_sentences if s)
    # Sentences made only of digits/punctuation clean down to nothing; skip them.
    return [tokens for tokens in tokenised if tokens]


def to_review_vector(review):
    """Represent a review as the mean of the word2vec vectors of its words.

    Relies on the module-level ``model`` (the trained Word2Vec model created
    in the ``__main__`` section). Stop words and words missing from the
    model's vocabulary are ignored.

    :param review: raw review text
    :return: ``pd.Series`` with one entry per embedding dimension; all zeros
        when none of the review's words are in the vocabulary
    """
    vectors = model.wv
    words = clean_text(review, remove_stopwords=True)
    known = [vectors[w] for w in words if w in vectors]
    if not known:
        return pd.Series(np.zeros(vectors.vector_size))
    return pd.Series(np.mean(known, axis=0, dtype=np.float64))


def plot_confusion_matrix(cm, classes,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map on the current figure.

    :param cm: 2-D array of counts; rows are true labels, columns are
        predicted labels
    :param classes: tick labels, used for both axes
    :param title: figure title
    :param cmap: matplotlib colormap used for the heat map
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)

    # Cells darker than half of the maximum get white text so the count stays readable.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


if __name__ == '__main__':
    # Load the labelled reviews.
    df = pd.read_csv('../data/labeledTrainData.tsv', sep='\t', escapechar='\\')

    # Build the word2vec training corpus: sentence-split the RAW reviews with
    # punkt, then clean each sentence into a token list. The token lists are
    # fed to Word2Vec as-is (no str() round trip, which would inject quote and
    # bracket characters into the vocabulary).
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    sentences_list = list(itertools.chain.from_iterable(df.review.apply(split_sentences)))

    # word2vec hyper-parameters
    num_features = 300  # Word vector dimensionality
    min_word_count = 40  # Minimum word count
    num_workers = 4  # Number of threads to run in parallel
    context = 10  # Context window size

    # NOTE(review): `size=` and `init_sims` are the gensim 3.x spellings; gensim 4
    # renamed `size` to `vector_size` - confirm against the installed version.
    model = Word2Vec(sentences_list, workers=num_workers, size=num_features, min_count=min_word_count, window=context)
    model.init_sims(replace=True)
    model.save('word2vec.models')

    # One averaged embedding per review.
    train_data_features = df.review.apply(to_review_vector)
    X_train, X_test, y_train, y_test = train_test_split(train_data_features, df.sentiment, test_size=0.2, random_state=0)

    LR_model = LogisticRegression()
    LR_model = LR_model.fit(X_train, y_train)
    y_pred = LR_model.predict(X_test)
    cnf_matrix = confusion_matrix(y_test, y_pred)

    # Recall = TP / (FN + TP); accuracy = (TP + TN) / all samples.
    print("Recall metric in the testing dataset: ", cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1]))
    print("accuracy metric in the testing dataset: ", (cnf_matrix[1, 1] + cnf_matrix[0, 0]) / (
                cnf_matrix[0, 0] + cnf_matrix[1, 1] + cnf_matrix[1, 0] + cnf_matrix[0, 1]))

    # Plot non-normalized confusion matrix
    class_names = [0, 1]
    plt.figure()
    plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix')
    plt.show()

转载于:https://www.cnblogs.com/ywjfx/p/11119175.html

你可能感兴趣的文章

每天一个linux命令（42）：kill命令

查看>>