"""Segment the crawled articles into words, count them, and draw word clouds."""
|
|
import os
from collections import Counter

import jieba
import jieba.posseg as pseg
import matplotlib.pyplot as plt
import pandas as pd
from matplotlib import font_manager
from wordcloud import WordCloud
|
|
# Draw minus signs with the ASCII hyphen so tick labels do not depend on the
# selected font providing a glyph for U+2212.
plt.rcParams["axes.unicode_minus"] = False

# Register the first candidate font file that exists on disk and make it the
# only sans-serif family, so the Chinese subplot titles can be rendered.
# If none of the files exists, matplotlib's font settings are left untouched.
for _fp in (
    "/usr/share/fonts/adobe-source-han-sans/SourceHanSansCN-Normal.otf",
    "/usr/share/fonts/noto-cjk/NotoSansCJK-Regular.ttc",
):
    if os.path.exists(_fp):
        font_manager.fontManager.addfont(_fp)
        # rcParams wants the family name, not the file path.
        plt.rcParams["font.sans-serif"] = [
            font_manager.FontProperties(fname=_fp).get_name()
        ]
        break
|
|
|
|
# Excel workbook read by main(); it must provide a "content" column.
INPUT = "articles.xlsx"

# Font file handed to WordCloud. The preferred file is used when it exists;
# otherwise the fallback path is taken as-is.
FONT = "/usr/share/fonts/adobe-source-han-sans/SourceHanSansCN-Normal.otf"
FONT_FALLBACK = "/usr/share/fonts/noto-cjk/NotoSansCJK-Regular.ttc"
if not os.path.exists(FONT):
    # NOTE: the fallback is not checked for existence here; if it is missing
    # too, the failure surfaces later when the font file is actually opened.
    FONT = FONT_FALLBACK
|
|
|
|
# Stop words dropped before counting.
# set("...") on a string yields its individual characters only, while main()
# tests whole tokens of length >= 2 against this set, so the multi-character
# stop words have to be listed explicitly or they are never filtered out.
# The single characters are kept so the set stays a superset of the old one.
STOP = set("的了和是在也及与或对从到把被让使由于这那我们他们") | {"由于", "我们", "他们"}
|
|
|
|
# Custom dictionary (assignment requirement: add new words).
# Each entry is registered with jieba at frequency 1000.
_CUSTOM_WORDS = (
    "习近平",
    "新时代",
    "中国式现代化",
    "二十大",
    "党中央",
    "共同富裕",
    "高质量发展",
    "一带一路",
    "人类命运共同体",
    "火神山",
)
for w in _CUSTOM_WORDS:
    jieba.add_word(w, freq=1000)
|
|
|
|
|
|
def _count_words(tagged_tokens):
    """Tally (word, POS flag) pairs into noun, verb and overall counters.

    Args:
        tagged_tokens: Iterable of ``(word, flag)`` pairs, as produced by
            ``pseg.cut``.

    Returns:
        A ``(nouns, verbs, all_words)`` tuple of ``Counter`` objects.
    """
    nouns, verbs, all_words = Counter(), Counter(), Counter()
    for word, flag in tagged_tokens:
        word = word.strip()
        if len(word) < 2 or word in STOP:
            continue
        # Keep only tokens with at least one character in U+4E00..U+9FFF;
        # this drops numbers, punctuation and purely Latin tokens.
        if not any("\u4e00" <= ch <= "\u9fff" for ch in word):
            continue
        all_words[word] += 1
        # Flags are matched by prefix, so sub-tags such as "nr" or "vn" count too.
        if flag.startswith("n"):
            nouns[word] += 1
        elif flag.startswith("v"):
            verbs[word] += 1
    return nouns, verbs, all_words


def _save_cloud(frequencies, out_path, **options):
    """Render ``frequencies`` as a word-cloud PNG at ``out_path``.

    Args:
        frequencies: Mapping of word to count.
        out_path: Destination image file.
        **options: Extra ``WordCloud`` keyword arguments (size, colormap, ...).

    Returns:
        True if the image was written, False if ``frequencies`` was empty.
    """
    # WordCloud.generate_from_frequencies raises ValueError on an empty table,
    # so skip the image instead of aborting the whole run.
    if not frequencies:
        print(f"skip {out_path}: no words to draw")
        return False
    cloud = WordCloud(font_path=FONT, background_color="white", **options)
    cloud.generate_from_frequencies(frequencies).to_file(out_path)
    return True


def main():
    """Run the whole analysis on the articles stored in ``INPUT``.

    Reads the "content" column of the workbook, segments the joined text with
    jieba's POS tagger, prints and saves the top 20 nouns and verbs
    (``top20_nouns.csv`` / ``top20_verbs.csv``), writes one word cloud each for
    all words, nouns and verbs, and finally a side-by-side figure
    (``wordclouds_combined.png``) of the clouds that were produced.
    """
    df = pd.read_excel(INPUT)
    all_text = "\n".join(df["content"].fillna("").astype(str).tolist())
    print(f"total chars={len(all_text)}")

    nouns, verbs, all_words = _count_words(pseg.cut(all_text))

    rankings = [
        ("名词", nouns.most_common(20), "top20_nouns.csv"),
        ("动词", verbs.most_common(20), "top20_verbs.csv"),
    ]
    for label, ranked, _ in rankings:
        print(f"\n=== Top20 {label} ===")
        for word, count in ranked:
            print(f" {word}\t{count}")
    for _, ranked, csv_path in rankings:
        pd.DataFrame(ranked, columns=["word", "count"]).to_csv(csv_path, index=False)

    # One cloud for all words, then one each for nouns and verbs.
    cloud_specs = [
        (all_words, "wordcloud_all.png", "整体",
         {"width": 1200, "height": 800, "max_words": 200}),
        (nouns, "wordcloud_nouns.png", "名词",
         {"width": 1000, "height": 700, "colormap": "Blues", "max_words": 100}),
        (verbs, "wordcloud_verbs.png", "动词",
         {"width": 1000, "height": 700, "colormap": "Reds", "max_words": 100}),
    ]
    panels = []
    for frequencies, path, title, options in cloud_specs:
        if _save_cloud(frequencies, path, **options):
            panels.append((path, title))
    if not panels:
        print("\nno words left after filtering; no word clouds written")
        return

    # Combined figure: one 7x7-inch panel per cloud that was written
    # (21x7 inches when all three exist).
    fig, axes = plt.subplots(
        1, len(panels), figsize=(7 * len(panels), 7), squeeze=False
    )
    try:
        for ax, (path, title) in zip(axes[0], panels):
            ax.imshow(plt.imread(path))
            ax.set_title(title, fontsize=20)
            ax.axis("off")
        fig.tight_layout()
        fig.savefig("wordclouds_combined.png", dpi=120)
    finally:
        # pyplot keeps a reference to every figure it creates until closed.
        plt.close(fig)

    saved = [path for path, _ in panels] + ["wordclouds_combined.png"]
    print("\nsaved " + " / ".join(saved))
|
|
|
|
|
|
# Run the analysis only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|