"""爬取人民网习主席讲话数据库首页链接 + 每篇标题正文,存到 excel。""" import requests, time, os, re from bs4 import BeautifulSoup import pandas as pd BASE = "http://jhsjk.people.cn/" LIST_URL = BASE + "result/1?form=706&else=501" HEADERS = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36"} OUT_XLSX = "articles.xlsx" def get(url): r = requests.get(url, headers=HEADERS, timeout=20) r.encoding = r.apparent_encoding r.raise_for_status() return r.text def extract_list(): html = get(LIST_URL) soup = BeautifulSoup(html, "lxml") links = [] for a in soup.find_all("a", href=True): href = a["href"] m = re.match(r"^article/(\d+)$", href) if not m: continue title = a.get_text(strip=True) if not title: continue url = BASE + href if (url, title) not in links: links.append((url, title)) return links def extract_article(url): html = get(url) soup = BeautifulSoup(html, "lxml") h1 = soup.find("h1") title = h1.get_text(strip=True) if h1 else "" body = soup.select_one(".d2txt_con") or soup.select_one("#content") text = body.get_text("\n", strip=True) if body else "" return title, text def main(): links = extract_list() print(f"found {len(links)} links") rows = [] for i, (url, link_title) in enumerate(links, 1): try: t, body = extract_article(url) title = t or link_title print(f"[{i}/{len(links)}] {title[:40]} chars={len(body)}") rows.append({"url": url, "title": title, "content": body}) time.sleep(0.5) except Exception as e: print(f"[{i}] failed: {e}") df = pd.DataFrame(rows) df.to_excel(OUT_XLSX, index=False) print(f"saved {OUT_XLSX}, {len(df)} rows") if __name__ == "__main__": main()