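# nlp/exp2_people/crawl.py
# 2026-04-29 18:34:27 +08:00
#
# 67 lines
# 1.9 KiB
# Python

"""Crawl the article links on the front page of the People's Daily Online (people.cn) database of President Xi's speeches, fetch each article's title and body text, and save them to an Excel file."""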
import requests, time, os, re
from bs4 import BeautifulSoup
import pandas as pd
# Site root; matched relative article hrefs are appended to it to build
# absolute URLs.
BASE = "http://jhsjk.people.cn/"
# Listing page that is scanned for article links.
# NOTE(review): the meaning of the `form` / `else` query parameters is not
# visible from this file -- confirm against the site before changing them.
LIST_URL = BASE + "result/1?form=706&else=501"
# Headers sent with every request (a browser-like User-Agent string).
HEADERS = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36"}
# Path of the Excel workbook the crawl result is written to.
OUT_XLSX = "articles.xlsx"
def get(url):
    """Download ``url`` and return the response body as text.

    The request is sent with the module-level ``HEADERS`` and a 20-second
    timeout.

    Args:
        url: Absolute URL to fetch.

    Returns:
        The response body decoded to ``str``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    resp = requests.get(url, headers=HEADERS, timeout=20)
    # Fail fast: check the status before running charset detection, so an
    # error page (whose body is never used) is not scanned for its encoding.
    resp.raise_for_status()
    # Decode with the encoding detected from the body instead of the one
    # declared in the HTTP headers (presumably the declared charset is not
    # reliable for this site -- confirm if decoding problems appear).
    resp.encoding = resp.apparent_encoding
    return resp.text
def extract_list():
    """Collect the article links found on the listing page.

    Downloads ``LIST_URL`` and keeps every ``<a>`` element whose ``href`` is
    exactly ``article/<digits>`` and whose link text is non-empty.

    Returns:
        A list of ``(url, title)`` tuples in page order, where ``url`` is
        ``BASE`` joined with the matched href. Each URL appears only once;
        the first non-empty link text seen for it is kept as its title, so
        an article that is linked several times is not downloaded twice.

    Raises:
        requests.RequestException: If the listing page cannot be fetched.
    """
    # Compiled once instead of being re-parsed for every anchor on the page.
    article_href = re.compile(r"article/\d+")
    soup = BeautifulSoup(get(LIST_URL), "lxml")

    # A dict keyed by URL gives an O(1) duplicate check while preserving the
    # order in which links were first seen.
    titles_by_url = {}
    for anchor in soup.find_all("a", href=True):
        href = anchor["href"]
        if not article_href.fullmatch(href):
            continue
        title = anchor.get_text(strip=True)
        if not title:
            # Text-less anchors are skipped; a later anchor with text for the
            # same article can still supply the title.
            continue
        titles_by_url.setdefault(BASE + href, title)
    return list(titles_by_url.items())
def extract_article(url):
    """Download one article page and pull out its headline and body text.

    Args:
        url: Address of the article page.

    Returns:
        A ``(title, text)`` tuple. ``title`` is the stripped text of the
        first ``<h1>`` element, or ``""`` when the page has none. ``text`` is
        the text of the first element matching ``.d2txt_con`` or, failing
        that, ``#content``, with each fragment stripped and the fragments
        joined by newlines; it is ``""`` when neither selector matches.
    """
    page = BeautifulSoup(get(url), "lxml")

    heading = page.find("h1")
    if heading:
        title = heading.get_text(strip=True)
    else:
        title = ""

    # Try the candidate containers in priority order and stop at the first
    # one that is found.
    container = None
    for selector in (".d2txt_con", "#content"):
        container = page.select_one(selector)
        if container:
            break

    if not container:
        return title, ""
    return title, container.get_text("\n", strip=True)
def main(delay=0.5):
    """Crawl every listed article and write the results to ``OUT_XLSX``.

    Fetches the link list, downloads each article in turn, and stores one
    row per successfully processed article with the columns ``url``,
    ``title`` and ``content``. The title taken from the article page is
    preferred; the link text from the listing is used when the page yields
    an empty title. Progress and failures are printed to stdout.

    Args:
        delay: Seconds to wait between two consecutive article requests.
            The pause applies after failed requests as well, so a failing
            server is not hit back-to-back.

    Raises:
        requests.RequestException: If the listing page itself cannot be
            fetched (failures of individual articles are reported and
            skipped instead).
    """
    links = extract_list()
    total = len(links)
    print(f"found {total} links")

    rows = []
    for i, (url, link_title) in enumerate(links, 1):
        # Throttle between requests regardless of whether the previous one
        # succeeded; nothing to wait for before the first article.
        if i > 1:
            time.sleep(delay)
        try:
            t, body = extract_article(url)
            title = t or link_title
            print(f"[{i}/{total}] {title[:40]} chars={len(body)}")
            rows.append({"url": url, "title": title, "content": body})
        except Exception as e:
            # Deliberately broad: the crawl is best-effort, so one bad
            # article is reported and skipped rather than aborting the run.
            print(f"[{i}] failed: {e}")

    # Naming the columns explicitly keeps the header row in the workbook
    # even when no article could be collected.
    df = pd.DataFrame(rows, columns=["url", "title", "content"])
    df.to_excel(OUT_XLSX, index=False)
    print(f"saved {OUT_XLSX}, {len(df)} rows")


if __name__ == "__main__":
    main()