"""对爬到的文章分词统计 + 词云。""" import pandas as pd import jieba import jieba.posseg as pseg from collections import Counter from wordcloud import WordCloud import matplotlib.pyplot as plt from matplotlib import font_manager plt.rcParams["axes.unicode_minus"] = False for _fp in ["/usr/share/fonts/adobe-source-han-sans/SourceHanSansCN-Normal.otf", "/usr/share/fonts/noto-cjk/NotoSansCJK-Regular.ttc"]: import os as _os if _os.path.exists(_fp): font_manager.fontManager.addfont(_fp) plt.rcParams["font.sans-serif"] = [font_manager.FontProperties(fname=_fp).get_name()] break INPUT = "articles.xlsx" FONT = "/usr/share/fonts/adobe-source-han-sans/SourceHanSansCN-Normal.otf" FONT_FALLBACK = "/usr/share/fonts/noto-cjk/NotoSansCJK-Regular.ttc" import os if not os.path.exists(FONT): FONT = FONT_FALLBACK STOP = set("的了和是在也及与或对从到把被让使由于这那我们他们") # 自定义词典(实验要求:添加新词) for w in ["习近平", "新时代", "中国式现代化", "二十大", "党中央", "共同富裕", "高质量发展", "一带一路", "人类命运共同体", "火神山"]: jieba.add_word(w, freq=1000) def main(): df = pd.read_excel(INPUT) all_text = "\n".join(df["content"].fillna("").astype(str).tolist()) print(f"total chars={len(all_text)}") nouns, verbs, all_words = Counter(), Counter(), Counter() for w, flag in pseg.cut(all_text): w = w.strip() if len(w) < 2 or w in STOP: continue if not any("\u4e00" <= c <= "\u9fff" for c in w): continue all_words[w] += 1 if flag.startswith("n"): nouns[w] += 1 elif flag.startswith("v"): verbs[w] += 1 print("\n=== Top20 名词 ===") for w, c in nouns.most_common(20): print(f" {w}\t{c}") print("\n=== Top20 动词 ===") for w, c in verbs.most_common(20): print(f" {w}\t{c}") pd.DataFrame(nouns.most_common(20), columns=["word", "count"]).to_csv("top20_nouns.csv", index=False) pd.DataFrame(verbs.most_common(20), columns=["word", "count"]).to_csv("top20_verbs.csv", index=False) # 词云(全部词) wc = WordCloud(font_path=FONT, width=1200, height=800, background_color="white", max_words=200) wc.generate_from_frequencies(all_words) wc.to_file("wordcloud_all.png") # 名词 / 动词 分别词云 WordCloud(font_path=FONT, width=1000, height=700, background_color="white", colormap="Blues", max_words=100).generate_from_frequencies(nouns).to_file("wordcloud_nouns.png") WordCloud(font_path=FONT, width=1000, height=700, background_color="white", colormap="Reds", max_words=100).generate_from_frequencies(verbs).to_file("wordcloud_verbs.png") # 组合图 fig, axes = plt.subplots(1, 3, figsize=(21, 7)) for ax, path, title in zip(axes, ["wordcloud_all.png", "wordcloud_nouns.png", "wordcloud_verbs.png"], ["整体", "名词", "动词"]): ax.imshow(plt.imread(path)) ax.set_title(title, fontsize=20) ax.axis("off") plt.tight_layout() plt.savefig("wordclouds_combined.png", dpi=120) print("\nsaved wordcloud_all.png / wordcloud_nouns.png / wordcloud_verbs.png / wordclouds_combined.png") if __name__ == "__main__": main()