# Python实现本地文本情感分析与可视化工具


背景介绍

在影评分析、日记情绪追踪、小说情感走向分析等场景中,文本情感分析能帮助我们快速理解文本的情绪倾向。本文将介绍如何开发一个本地化工具,支持用户上传多个文本文件和自定义情感词典,通过词匹配分析情感倾向,并以柱状图与分情感词云直观展示结果。该工具基于Python开发,结合文件处理、数据分析与可视化技术,适合Python中级以下开发者学习。

思路分析

工具的核心流程分为四步:
1. 文件读取:读取多文本文件(支持UTF-8/GBK编码)和自定义情感词典(正面、负面词列表)。
2. 情感分析:分词后匹配情感词典计算得分,判断情感倾向(积极/消极/中性)。
3. 统计分析:统计各情感类别的文件数量、占比和平均分。
4. 可视化:用matplotlib绘制柱状图展示分布,用wordcloud生成分情感词云(不同颜色区分)。

代码实现

1. 依赖库导入

需安装matplotlib、wordcloud、jieba(中文分词):

pip install matplotlib wordcloud jieba

代码中导入核心库:

import os
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import jieba

2. 文件读取模块

处理多文本文件和情感词典的读取,兼容不同编码:

def read_text_files(file_paths):
    """Read several text files, trying UTF-8 first with a GBK fallback.

    Args:
        file_paths: iterable of paths to the text files to read.

    Returns:
        A list of file contents (str), in the same order as *file_paths*.

    Raises:
        UnicodeDecodeError: if a file is valid in neither encoding.
    """
    texts = []
    for file_path in file_paths:
        # GBK is a common legacy encoding for Chinese text files.
        for encoding in ('utf-8', 'gbk'):
            try:
                with open(file_path, 'r', encoding=encoding) as f:
                    texts.append(f.read())
                break
            except UnicodeDecodeError:
                if encoding == 'gbk':
                    raise  # neither encoding worked; propagate
    return texts

def _load_word_set(dict_path):
    """Load one sentiment lexicon (one word per line) into a set.

    Blank lines are skipped. UTF-8 is tried first with a GBK fallback,
    consistent with read_text_files (the original only accepted UTF-8,
    which was inconsistent with how the text files are read).

    Raises:
        UnicodeDecodeError: if the file is valid in neither encoding.
    """
    for encoding in ('utf-8', 'gbk'):
        try:
            words = set()
            with open(dict_path, 'r', encoding=encoding) as f:
                for line in f:
                    word = line.strip()
                    if word:  # filter blank lines
                        words.add(word)
            return words
        except UnicodeDecodeError:
            if encoding == 'gbk':
                raise


def read_sentiment_dicts(pos_dict_path, neg_dict_path):
    """Read the positive/negative sentiment lexicons.

    Args:
        pos_dict_path: path to the positive-word list (one word per line).
        neg_dict_path: path to the negative-word list (one word per line).

    Returns:
        (pos_words, neg_words) as two sets of strings.
    """
    return _load_word_set(pos_dict_path), _load_word_set(neg_dict_path)

3. 情感分析模块

对文本分词(中文用jieba),匹配情感词典计算得分,判断情感倾向:

def analyze_sentiment(texts, pos_words, neg_words):
    """Lexicon-based sentiment analysis: tokenize, match, score, classify.

    Args:
        texts: list of raw text strings.
        pos_words: set of positive sentiment words.
        neg_words: set of negative sentiment words.

    Returns:
        (results, pos_freq, neg_freq, neutral_freq) where results is a list
        of dicts with keys 'text', 'score', 'sentiment', 'words', and the
        three freq dicts map word -> occurrence count across all texts.
    """
    results = []
    pos_freq = {}
    neg_freq = {}
    neutral_freq = {}
    every_token = []  # all tokens from all texts, used for neutral counts

    for text in texts:
        tokens = jieba.lcut(text)  # Chinese word segmentation
        every_token.extend(tokens)

        positive_hits = 0
        negative_hits = 0
        for token in tokens:
            if token in pos_words:
                positive_hits += 1
                pos_freq[token] = pos_freq.get(token, 0) + 1
            elif token in neg_words:
                negative_hits += 1
                neg_freq[token] = neg_freq.get(token, 0) + 1

        # Score is simply (#positive hits - #negative hits).
        score = positive_hits - negative_hits
        if score > 0:
            sentiment = 'positive'
        elif score < 0:
            sentiment = 'negative'
        else:
            sentiment = 'neutral'

        results.append({
            'text': text,
            'score': score,
            'sentiment': sentiment,
            'words': tokens
        })

    # Everything that matched neither lexicon counts as a neutral token.
    for token in every_token:
        if token not in pos_words and token not in neg_words:
            neutral_freq[token] = neutral_freq.get(token, 0) + 1

    return results, pos_freq, neg_freq, neutral_freq

4. 统计分析模块

统计各情感类别的文件数量、占比和平均分:

def calculate_statistics(results):
    """Summarize the sentiment distribution: counts, percentages, mean scores.

    Args:
        results: list of dicts carrying 'sentiment' (one of 'positive',
            'negative', 'neutral') and a numeric 'score', as produced by
            analyze_sentiment.

    Returns:
        (stats, percentages, averages):
        - stats maps each sentiment to {'count', 'total_score'};
        - percentages maps each sentiment to a 0-100 float;
        - averages maps each sentiment to its mean score (0 if empty).
    """
    stats = {
        'positive': {'count': 0, 'total_score': 0},
        'negative': {'count': 0, 'total_score': 0},
        'neutral': {'count': 0, 'total_score': 0}
    }

    for res in results:
        sentiment = res['sentiment']
        stats[sentiment]['count'] += 1
        stats[sentiment]['total_score'] += res['score']

    total = sum(stats[s]['count'] for s in stats)
    # Guard against an empty result list: report 0% for every category
    # instead of raising ZeroDivisionError (bug in the original).
    percentages = {
        s: (stats[s]['count'] / total * 100) if total else 0.0
        for s in stats
    }
    averages = {
        s: (stats[s]['total_score'] / stats[s]['count'])
        if stats[s]['count'] > 0 else 0
        for s in stats
    }

    return stats, percentages, averages

5. 可视化模块

  • 柱状图:展示各情感类别的文件数量。
  • 词云:分情感生成词云(积极→绿色、消极→红色、中性→灰色)。
def plot_bar_chart(stats):
    """Draw a bar chart of the number of files per sentiment category.

    Args:
        stats: mapping of 'positive'/'negative'/'neutral' to dicts
            containing at least a 'count' key.

    Returns:
        The current matplotlib Figure holding the chart.
    """
    categories = ['positive', 'negative', 'neutral']
    labels = ['Positive', 'Negative', 'Neutral']
    bar_colors = ['green', 'red', 'gray']  # fixed color per sentiment
    file_counts = [stats[c]['count'] for c in categories]

    plt.figure(figsize=(8, 6))
    bars = plt.bar(labels, file_counts, color=bar_colors)
    plt.title('Emotion Distribution (File Count)')
    plt.xlabel('Sentiment')
    plt.ylabel('Number of Files')
    plt.xticks(rotation=45)

    # Write each bar's count just above its top edge.
    for rect in bars:
        value = rect.get_height()
        x_center = rect.get_x() + rect.get_width() / 2.
        plt.text(x_center, value + 0.1, f'{value}', ha='center', va='bottom')

    plt.tight_layout()
    return plt.gcf()  # hand the figure back so the caller can save/show it


def generate_wordclouds(pos_freq, neg_freq, neutral_freq, font_path='simhei.ttf'):
    """Build one word cloud per sentiment category.

    Args:
        pos_freq: dict mapping positive words to frequencies.
        neg_freq: dict mapping negative words to frequencies.
        neutral_freq: dict mapping neutral words to frequencies.
        font_path: path to a font with Chinese glyph support. Defaults to
            'simhei.ttf' (Windows); on Linux/Mac pass the path of an
            installed Chinese font. (Was hard-coded three times in the
            original; now a single backward-compatible parameter.)

    Returns:
        (pos_wc, neg_wc, neutral_wc) WordCloud objects.
    """
    def _build(freq, background_color):
        # All three clouds share the same geometry; only the data and the
        # background color differ.
        return WordCloud(
            background_color=background_color,
            width=800,
            height=400,
            font_path=font_path,
            max_words=200
        ).generate_from_frequencies(freq)

    pos_wc = _build(pos_freq, 'green')        # positive -> green
    neg_wc = _build(neg_freq, 'red')          # negative -> red
    neutral_wc = _build(neutral_freq, 'gray') # neutral  -> gray

    return pos_wc, neg_wc, neutral_wc

6. 主函数:流程整合

def main():
    """End-to-end pipeline: load files, analyze, report, and visualize."""
    # Example input paths (in practice these could come from a GUI or CLI).
    text_files = ['story.txt', 'movie_review.txt', 'diary.txt']
    pos_dict_path = 'positive_words.txt'
    neg_dict_path = 'negative_words.txt'

    # 1. Load the texts and the sentiment lexicons.
    texts = read_text_files(text_files)
    pos_words, neg_words = read_sentiment_dicts(pos_dict_path, neg_dict_path)

    # 2. Run the lexicon-based sentiment analysis.
    results, pos_freq, neg_freq, neutral_freq = analyze_sentiment(
        texts, pos_words, neg_words
    )

    # 3. Aggregate per-category statistics.
    stats, percentages, averages = calculate_statistics(results)

    # 4. Print the textual report.
    print("=== Emotion Statistics Report ===")
    for s in ['positive', 'negative', 'neutral']:
        print(
            f"{s.capitalize()} texts: {stats[s]['count']} "
            f"({percentages[s]:.1f}%), average score: {averages[s]:.1f}"
        )

    # 5. Save the bar chart to disk.
    bar_chart = plot_bar_chart(stats)
    bar_chart.savefig('emotion_distribution.png')

    # 6. Save one word-cloud image per sentiment.
    pos_wc, neg_wc, neutral_wc = generate_wordclouds(pos_freq, neg_freq, neutral_freq)
    pos_wc.to_file('positive_wordcloud.png')
    neg_wc.to_file('negative_wordcloud.png')
    neutral_wc.to_file('neutral_wordcloud.png')

    # 7. Optionally display each cloud on screen, one figure at a time.
    for cloud, title in (
        (pos_wc, 'Positive Words'),
        (neg_wc, 'Negative Words'),
        (neutral_wc, 'Neutral Words'),
    ):
        plt.figure()
        plt.imshow(cloud)
        plt.axis('off')
        plt.title(title)
        plt.show()


if __name__ == "__main__":
    main()

运行示例

  1. 准备文件
    • 文本文件:story.txt、movie_review.txt、diary.txt(含中文文本)。
    • 情感词典:positive_words.txt(每行一个积极词,如“快乐”“精彩”)、negative_words.txt(每行一个消极词,如“悲伤”“糟糕”)。
  2. 运行代码
    执行python emotion_analysis.py,将生成:

    • 统计报告(终端输出)。
    • 柱状图:emotion_distribution.png
    • 词云图:positive_wordcloud.png、negative_wordcloud.png、neutral_wordcloud.png

总结与扩展

该工具通过文件处理(多编码兼容)、情感分析(词典匹配+得分计算)、可视化(matplotlib+wordcloud)三大模块,实现了文本情感的本地化分析。开发者可在此基础上扩展:
– 添加GUI界面(如用tkinter/PyQt),支持文件选择和结果交互。
– 优化情感分析算法(如引入词权重、否定词处理)。
– 支持更多情感类别(如“愤怒”“喜悦”细分)。

通过该项目,学习者可掌握Python文件操作、数据分析与可视化的核心技能,快速应用于实际文本分析场景。

代码依赖

需安装以下库:

pip install matplotlib wordcloud jieba

(注:词云的font_path需根据系统调整,Windows可使用'simhei.ttf',Linux/Mac需安装中文字体并指定路径。)