nlp/exp2_people/analyze.py
2026-04-29 18:34:27 +08:00

88 lines
3.4 KiB
Python

"""对爬到的文章分词统计 + 词云。"""
import pandas as pd
import jieba
import jieba.posseg as pseg
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from matplotlib import font_manager
import os as _os

# Turn off the Unicode minus glyph for matplotlib text.
plt.rcParams["axes.unicode_minus"] = False

# Candidate font files, in order of preference.
_FONT_CANDIDATES = (
    "/usr/share/fonts/adobe-source-han-sans/SourceHanSansCN-Normal.otf",
    "/usr/share/fonts/noto-cjk/NotoSansCJK-Regular.ttc",
)

# Register the first candidate that exists on disk and make its family name
# matplotlib's sans-serif font; leave the defaults alone if none is found.
for _fp in _FONT_CANDIDATES:
    if not _os.path.exists(_fp):
        continue
    font_manager.fontManager.addfont(_fp)
    _family = font_manager.FontProperties(fname=_fp).get_name()
    plt.rcParams["font.sans-serif"] = [_family]
    break
import os

# Excel workbook that main() loads with pandas.
INPUT = "articles.xlsx"

# Font files handed to WordCloud: the first path is used when it exists,
# otherwise the fallback path is used (without checking that it exists).
FONT_FALLBACK = "/usr/share/fonts/noto-cjk/NotoSansCJK-Regular.ttc"
_PREFERRED_FONT = "/usr/share/fonts/adobe-source-han-sans/SourceHanSansCN-Normal.otf"
FONT = _PREFERRED_FONT if os.path.exists(_PREFERRED_FONT) else FONT_FALLBACK
# Stop words excluded from the word statistics.
# set("...") splits the string into single characters, so on its own it can
# never match the multi-character words spelled out inside it. Those words
# are added explicitly; every single-character member is kept as before.
STOP = set("的了和是在也及与或对从到把被让使由于这那我们他们") | {"由于", "我们", "他们"}
# Custom dictionary (lab requirement: add new words). Each entry is
# registered with jieba at frequency 1000.
_CUSTOM_WORDS = (
    "习近平",
    "新时代",
    "中国式现代化",
    "二十大",
    "党中央",
    "共同富裕",
    "高质量发展",
    "一带一路",
    "人类命运共同体",
    "火神山",
)
for w in _CUSTOM_WORDS:
    jieba.add_word(w, freq=1000)
def _count_words(tagged_words):
    """Tally the words worth reporting from ``(word, pos_flag)`` pairs.

    A word is kept only if, after stripping whitespace, it is at least two
    characters long, is not in ``STOP`` and contains at least one character
    in the range U+4E00..U+9FFF.

    Returns:
        A ``(nouns, verbs, all_words)`` tuple of ``Counter`` objects. Words
        whose flag starts with ``"n"`` go to ``nouns``, those starting with
        ``"v"`` go to ``verbs``; every kept word is counted in ``all_words``.
    """
    nouns, verbs, all_words = Counter(), Counter(), Counter()
    for word, flag in tagged_words:
        word = word.strip()
        if len(word) < 2 or word in STOP:
            continue
        if not any("\u4e00" <= ch <= "\u9fff" for ch in word):
            continue
        all_words[word] += 1
        if flag.startswith("n"):
            nouns[word] += 1
        elif flag.startswith("v"):
            verbs[word] += 1
    return nouns, verbs, all_words


def _report_top(counter, label, csv_path, top_n=20):
    """Print the ``top_n`` most common words and write them to ``csv_path``."""
    top = counter.most_common(top_n)
    print(f"\n=== Top{top_n} {label} ===")
    for word, count in top:
        print(f" {word}\t{count}")
    pd.DataFrame(top, columns=["word", "count"]).to_csv(csv_path, index=False)


def _save_cloud(frequencies, path, **options):
    """Render ``frequencies`` as a word-cloud image at ``path``.

    Returns:
        ``True`` if the image was written, ``False`` if ``frequencies`` was
        empty. The empty case is skipped because
        ``WordCloud.generate_from_frequencies`` raises ``ValueError`` when it
        is given no words.
    """
    if not frequencies:
        print(f"skip {path}: no words to draw")
        return False
    cloud = WordCloud(font_path=FONT, background_color="white", **options)
    cloud.generate_from_frequencies(frequencies)
    cloud.to_file(path)
    return True


def main():
    """Segment the articles, report the top words and draw the word clouds.

    Reads the ``content`` column of ``INPUT``, tags the joined text with
    jieba, prints and saves the 20 most common nouns and verbs
    (``top20_nouns.csv`` / ``top20_verbs.csv``), then writes one word cloud
    for all words, one for nouns and one for verbs, plus a side-by-side
    figure (``wordclouds_combined.png``) of the clouds that were produced.
    """
    df = pd.read_excel(INPUT)
    all_text = "\n".join(df["content"].fillna("").astype(str).tolist())
    print(f"total chars={len(all_text)}")

    nouns, verbs, all_words = _count_words(pseg.cut(all_text))

    _report_top(nouns, "名词", "top20_nouns.csv")
    _report_top(verbs, "动词", "top20_verbs.csv")

    # (output path, subplot title, frequencies, WordCloud options)
    clouds = [
        ("wordcloud_all.png", "整体", all_words,
         {"width": 1200, "height": 800, "max_words": 200}),
        ("wordcloud_nouns.png", "名词", nouns,
         {"width": 1000, "height": 700, "colormap": "Blues", "max_words": 100}),
        ("wordcloud_verbs.png", "动词", verbs,
         {"width": 1000, "height": 700, "colormap": "Reds", "max_words": 100}),
    ]
    saved = [
        (path, title)
        for path, title, frequencies, options in clouds
        if _save_cloud(frequencies, path, **options)
    ]
    if not saved:
        print("\nno words found; no word clouds were generated")
        return

    # Combined figure: one 7x7-inch panel per cloud that was actually written.
    # squeeze=False keeps `axes` two-dimensional even with a single panel.
    fig, axes = plt.subplots(1, len(saved), figsize=(7 * len(saved), 7), squeeze=False)
    for ax, (path, title) in zip(axes[0], saved):
        ax.imshow(plt.imread(path))
        ax.set_title(title, fontsize=20)
        ax.axis("off")
    fig.tight_layout()
    fig.savefig("wordclouds_combined.png", dpi=120)
    plt.close(fig)  # release the figure once it is on disk

    written = [path for path, _ in saved] + ["wordclouds_combined.png"]
    print("\nsaved " + " / ".join(written))


if __name__ == "__main__":
    main()