Background
In scenarios such as movie review analysis, diary mood tracking, and tracing the emotional arc of a novel, text sentiment analysis helps us quickly grasp the emotional tendency of a text. This article shows how to build a local tool that lets users load multiple text files and a custom sentiment lexicon, analyzes sentiment by word matching, and presents the results intuitively with a bar chart and per-sentiment word clouds. The tool is written in Python, combining file handling, data analysis, and visualization, and is suitable for beginner-to-intermediate Python developers.
Approach
The tool's core workflow has four steps:
1. File reading: read multiple text files (UTF-8/GBK encodings supported) and a custom sentiment lexicon (lists of positive and negative words).
2. Sentiment analysis: tokenize each text, match tokens against the lexicon to compute a score, and classify the sentiment (positive/negative/neutral).
3. Statistics: count the number of files in each sentiment class, their share, and the average score.
4. Visualization: draw a bar chart of the distribution with matplotlib and generate per-sentiment word clouds with wordcloud (distinguished by color).
Implementation
1. Importing dependencies
Install matplotlib, wordcloud, and jieba (for Chinese word segmentation):
pip install matplotlib wordcloud jieba
Import the core libraries in the code:
import os
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import jieba
2. File reading module
Read multiple text files and the sentiment lexicons, handling different encodings:
def read_text_files(file_paths):
    """Read multiple text files, falling back from UTF-8 to GBK."""
    texts = []
    for file_path in file_paths:
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                texts.append(f.read())
        except UnicodeDecodeError:
            # Fall back to GBK, a common encoding for Chinese files
            with open(file_path, 'r', encoding='gbk') as f:
                texts.append(f.read())
    return texts
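The os module imported above is not used by read_text_files itself; one place it comes in handy is collecting every .txt file from a folder instead of listing paths by hand. A minimal sketch (the folder name in the usage comment is just an example):

def collect_txt_files(folder):
    """Return the paths of all .txt files in a folder (non-recursive)."""
    return [
        os.path.join(folder, name)
        for name in sorted(os.listdir(folder))
        if name.lower().endswith('.txt')
    ]

# e.g. texts = read_text_files(collect_txt_files('reviews'))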
def read_sentiment_dicts(pos_dict_path, neg_dict_path):
    """Read the positive/negative lexicons and return them as sets of words."""
    pos_words, neg_words = set(), set()
    # Positive lexicon
    with open(pos_dict_path, 'r', encoding='utf-8') as f:
        for line in f:
            word = line.strip()
            if word:  # skip blank lines
                pos_words.add(word)
    # Negative lexicon
    with open(neg_dict_path, 'r', encoding='utf-8') as f:
        for line in f:
            word = line.strip()
            if word:
                neg_words.add(word)
    return pos_words, neg_words
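The lexicon files are expected to contain one word per line. A quick self-contained check of the reader, writing two throwaway files first (the file names and sample words are illustrative):

with open('pos_demo.txt', 'w', encoding='utf-8') as f:
    f.write('快乐\n精彩\n')
with open('neg_demo.txt', 'w', encoding='utf-8') as f:
    f.write('悲伤\n糟糕\n')

pos_words, neg_words = read_sentiment_dicts('pos_demo.txt', 'neg_demo.txt')
print(pos_words)  # {'快乐', '精彩'} (set order may vary)
print(neg_words)  # {'悲伤', '糟糕'}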
3. Sentiment analysis module
Tokenize each text (with jieba for Chinese), match tokens against the lexicons to compute a score, and classify the sentiment:
def analyze_sentiment(texts, pos_words, neg_words):
    """Analyze sentiment: tokenize -> match lexicons -> score -> classify."""
    results = []  # per-text analysis results
    pos_freq, neg_freq, neutral_freq = {}, {}, {}  # global word frequencies per sentiment
    all_words = []  # tokens from all texts (used for the neutral-word counts)
    for text in texts:
        # Chinese word segmentation with jieba
        words = jieba.lcut(text)
        all_words.extend(words)
        pos_count, neg_count = 0, 0  # positive/negative word counts
        current_pos, current_neg = {}, {}  # sentiment-word frequencies of this text
        for word in words:
            if word in pos_words:
                pos_count += 1
                current_pos[word] = current_pos.get(word, 0) + 1
            elif word in neg_words:
                neg_count += 1
                current_neg[word] = current_neg.get(word, 0) + 1
        # Update the global sentiment-word frequencies
        for word, cnt in current_pos.items():
            pos_freq[word] = pos_freq.get(word, 0) + cnt
        for word, cnt in current_neg.items():
            neg_freq[word] = neg_freq.get(word, 0) + cnt
        # Sentiment score = positive count - negative count
        score = pos_count - neg_count
        # Classify the sentiment
        sentiment = 'positive' if score > 0 else 'negative' if score < 0 else 'neutral'
        results.append({
            'text': text,
            'score': score,
            'sentiment': sentiment,
            'words': words
        })
    # Count the neutral words (tokens that match neither lexicon)
    for word in all_words:
        if word not in pos_words and word not in neg_words:
            neutral_freq[word] = neutral_freq.get(word, 0) + 1
    return results, pos_freq, neg_freq, neutral_freq
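A worked example of the scoring on a single sentence with tiny inline lexicons (the sentence and lexicons are illustrative, and it assumes jieba splits out 精彩 and 糟糕 as single tokens, which it does for these common words): one positive hit and one negative hit give a score of 0, so the text is classified as neutral.

demo_pos = {'精彩', '快乐'}
demo_neg = {'糟糕', '悲伤'}
demo_results, _, _, _ = analyze_sentiment(
    ['这部电影很精彩,但是结局有点糟糕。'], demo_pos, demo_neg
)
print(demo_results[0]['score'])      # 0  (1 positive - 1 negative)
print(demo_results[0]['sentiment'])  # 'neutral'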
4. Statistics module
Count the number of files, the share, and the average score for each sentiment class:
def calculate_statistics(results):
    """Compute the sentiment distribution: counts, percentages, average scores."""
    stats = {
        'positive': {'count': 0, 'total_score': 0},
        'negative': {'count': 0, 'total_score': 0},
        'neutral': {'count': 0, 'total_score': 0}
    }
    for res in results:
        sentiment = res['sentiment']
        stats[sentiment]['count'] += 1
        stats[sentiment]['total_score'] += res['score']
    total = sum(stats[s]['count'] for s in stats)
    # Percentages and average scores (guard against empty input)
    percentages = {
        s: (stats[s]['count'] / total * 100) if total > 0 else 0
        for s in stats
    }
    averages = {
        s: (stats[s]['total_score'] / stats[s]['count'])
        if stats[s]['count'] > 0 else 0
        for s in stats
    }
    return stats, percentages, averages
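A quick sanity check with hand-made results (the mock entries below are purely illustrative):

mock_results = [
    {'text': 'a', 'score': 2, 'sentiment': 'positive', 'words': []},
    {'text': 'b', 'score': -1, 'sentiment': 'negative', 'words': []},
    {'text': 'c', 'score': 0, 'sentiment': 'neutral', 'words': []},
]
stats, percentages, averages = calculate_statistics(mock_results)
print(percentages['positive'])  # 33.33...
print(averages['negative'])     # -1.0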
5. Visualization module
- Bar chart: shows the number of files in each sentiment class.
- Word clouds: one per sentiment class (positive → green, negative → red, neutral → gray).
def plot_bar_chart(stats):
    """Draw the sentiment distribution as a bar chart."""
    sentiments = ['Positive', 'Negative', 'Neutral']
    counts = [stats[s]['count'] for s in ['positive', 'negative', 'neutral']]
    colors = ['green', 'red', 'gray']  # one color per sentiment
    plt.figure(figsize=(8, 6))
    bars = plt.bar(sentiments, counts, color=colors)
    plt.title('Emotion Distribution (File Count)')
    plt.xlabel('Sentiment')
    plt.ylabel('Number of Files')
    plt.xticks(rotation=45)
    # Label each bar with its value
    for bar in bars:
        height = bar.get_height()
        plt.text(
            bar.get_x() + bar.get_width()/2.,
            height + 0.1,
            f'{height}',
            ha='center', va='bottom'
        )
    plt.tight_layout()
    return plt.gcf()  # return the figure object
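If you also want each bar to show its share from calculate_statistics, a small variant is enough. This is only a sketch that reuses the same dictionary keys, not part of the core tool:

def plot_bar_chart_with_percent(stats, percentages):
    """Like plot_bar_chart, but labels each bar with its count and percentage."""
    order = ['positive', 'negative', 'neutral']
    labels = ['Positive', 'Negative', 'Neutral']
    counts = [stats[s]['count'] for s in order]
    plt.figure(figsize=(8, 6))
    bars = plt.bar(labels, counts, color=['green', 'red', 'gray'])
    plt.title('Emotion Distribution (File Count)')
    plt.ylabel('Number of Files')
    for bar, s in zip(bars, order):
        height = bar.get_height()
        plt.text(bar.get_x() + bar.get_width() / 2., height + 0.1,
                 f'{height} ({percentages[s]:.1f}%)', ha='center', va='bottom')
    plt.tight_layout()
    return plt.gcf()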
def generate_wordclouds(pos_freq, neg_freq, neutral_freq):
    """Generate one word cloud per sentiment (a Chinese font path must be given)."""
    # Positive word cloud (green background)
    pos_wc = WordCloud(
        background_color='green',
        width=800,
        height=400,
        font_path='simhei.ttf',  # replace with the path of a font that supports Chinese (e.g. simhei.ttf on Windows)
        max_words=200
    ).generate_from_frequencies(pos_freq)
    # Negative word cloud (red background)
    neg_wc = WordCloud(
        background_color='red',
        width=800,
        height=400,
        font_path='simhei.ttf',
        max_words=200
    ).generate_from_frequencies(neg_freq)
    # Neutral word cloud (gray background)
    neutral_wc = WordCloud(
        background_color='gray',
        width=800,
        height=400,
        font_path='simhei.ttf',
        max_words=200
    ).generate_from_frequencies(neutral_freq)
    return pos_wc, neg_wc, neutral_wc
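Instead of opening three separate windows as main() does below, the clouds can also sit side by side in a single figure. A sketch (it assumes each frequency dict is non-empty, since wordcloud cannot build a cloud from zero words):

def show_wordclouds_in_one_figure(pos_wc, neg_wc, neutral_wc):
    """Display the three word clouds side by side in one figure."""
    clouds = [(pos_wc, 'Positive Words'),
              (neg_wc, 'Negative Words'),
              (neutral_wc, 'Neutral Words')]
    fig, axes = plt.subplots(1, 3, figsize=(18, 5))
    for ax, (wc, title) in zip(axes, clouds):
        ax.imshow(wc)
        ax.axis('off')
        ax.set_title(title)
    fig.tight_layout()
    return fig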
6. Main function: putting it all together
def main():
    # Example file paths (in practice these could come from a GUI or the command line)
    text_files = ['story.txt', 'movie_review.txt', 'diary.txt']
    pos_dict_path = 'positive_words.txt'
    neg_dict_path = 'negative_words.txt'
    # 1. Read the files
    texts = read_text_files(text_files)
    pos_words, neg_words = read_sentiment_dicts(pos_dict_path, neg_dict_path)
    # 2. Sentiment analysis
    results, pos_freq, neg_freq, neutral_freq = analyze_sentiment(texts, pos_words, neg_words)
    # 3. Statistics
    stats, percentages, averages = calculate_statistics(results)
    # 4. Print the statistics report
    print("=== Emotion Statistics Report ===")
    for s in ['positive', 'negative', 'neutral']:
        count = stats[s]['count']
        percent = percentages[s]
        avg_score = averages[s]
        print(f"{s.capitalize()} texts: {count} ({percent:.1f}%), average score: {avg_score:.1f}")
    # 5. Visualization
    # Bar chart
    bar_chart = plot_bar_chart(stats)
    bar_chart.savefig('emotion_distribution.png')
    # Word clouds
    pos_wc, neg_wc, neutral_wc = generate_wordclouds(pos_freq, neg_freq, neutral_freq)
    pos_wc.to_file('positive_wordcloud.png')
    neg_wc.to_file('negative_wordcloud.png')
    neutral_wc.to_file('neutral_wordcloud.png')
    # Display the charts (optional)
    plt.figure()
    plt.imshow(pos_wc)
    plt.axis('off')
    plt.title('Positive Words')
    plt.show()
    plt.figure()
    plt.imshow(neg_wc)
    plt.axis('off')
    plt.title('Negative Words')
    plt.show()
    plt.figure()
    plt.imshow(neutral_wc)
    plt.axis('off')
    plt.title('Neutral Words')
    plt.show()

if __name__ == "__main__":
    main()
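main() hard-codes the example paths; as the first comment hints, they could also be taken from the command line. A minimal sketch with argparse (the option names are my own choice, not part of the original tool):

import argparse

def parse_args():
    """Parse the text files and lexicon paths from the command line."""
    parser = argparse.ArgumentParser(description='Local text sentiment analysis')
    parser.add_argument('files', nargs='+', help='text files to analyze')
    parser.add_argument('--pos-dict', default='positive_words.txt',
                        help='path to the positive-word lexicon')
    parser.add_argument('--neg-dict', default='negative_words.txt',
                        help='path to the negative-word lexicon')
    return parser.parse_args()

# Usage: python emotion_analysis.py story.txt diary.txt --pos-dict positive_words.txt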
Running Example
- Prepare the files:
  - Text files: story.txt, movie_review.txt, diary.txt (containing Chinese text).
  - Sentiment lexicons: positive_words.txt (one positive word per line, e.g. 快乐, 精彩) and negative_words.txt (one negative word per line, e.g. 悲伤, 糟糕).
- Run the code:
  Executing python emotion_analysis.py produces:
  - A statistics report (printed to the terminal).
  - A bar chart: emotion_distribution.png.
  - Word clouds: positive_wordcloud.png, negative_wordcloud.png, neutral_wordcloud.png.
Summary and Extensions
The tool implements local text sentiment analysis with three main modules: file handling (multi-encoding support), sentiment analysis (lexicon matching plus scoring), and visualization (matplotlib + wordcloud). It can be extended in several directions:
- Add a GUI (e.g. with tkinter or PyQt) for file selection and interactive results.
- Improve the sentiment algorithm (e.g. word weights, negation handling; a small sketch follows below).
- Support finer-grained sentiment classes (e.g. "anger", "joy").
Through this project, learners practice core Python skills in file handling, data analysis, and visualization that carry over directly to real-world text analysis tasks.
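As a taste of the second extension, negation handling can be prototyped by flipping the contribution of a sentiment word when the previous token is a negation word. A minimal sketch (the negation list is illustrative and far from complete):

NEGATION_WORDS = {'不', '没', '没有', '别', '未'}

def score_with_negation(words, pos_words, neg_words):
    """Like the counting in analyze_sentiment, but a sentiment word preceded by
    a negation word contributes to the opposite class (e.g. '不 快乐' counts as negative)."""
    score = 0
    for i, word in enumerate(words):
        negated = i > 0 and words[i - 1] in NEGATION_WORDS
        if word in pos_words:
            score += -1 if negated else 1
        elif word in neg_words:
            score += 1 if negated else -1
    return score

# e.g. score_with_negation(['我', '今天', '不', '快乐'], {'快乐'}, set())  # -> -1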
Dependencies
Install the following libraries:
pip install matplotlib wordcloud jieba
(Note: the word cloud's font_path must match your system; 'simhei.ttf' works on Windows, while on Linux/macOS you need to install a Chinese font and point font_path at it.)