nlp/exp2_people/crawl.py

"""爬取人民网习主席讲话数据库首页链接 + 每篇标题正文，存到 excel。"""
import requests, time, os, re
from bs4 import BeautifulSoup
import pandas as pd

# Root URL of the site; relative article hrefs are appended to it.
BASE = "http://jhsjk.people.cn/"
# Listing page that extract_list() fetches to discover article links.
LIST_URL = BASE + "result/1?form=706&else=501"
# Request headers sent with every fetch (a browser-like User-Agent).
HEADERS = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36"}
# Path of the output workbook (relative, so it lands in the working directory).
OUT_XLSX = "articles.xlsx"


def get(url):
    """Fetch *url* and return the response body as decoded text.

    The request is sent with the module-level ``HEADERS`` and a 20-second
    timeout.

    Args:
        url: Absolute URL to request.

    Returns:
        The response body, decoded using the encoding detected from its
        bytes.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
    """
    response = requests.get(url, headers=HEADERS, timeout=20)
    # Check the status first so charset detection is not run on the body of
    # an error response that is about to be discarded.
    response.raise_for_status()
    # Override the encoding with the one detected from the response bytes.
    response.encoding = response.apparent_encoding
    return response.text


def extract_list():
    """Collect the article links found on the listing page.

    Fetches ``LIST_URL`` and keeps every anchor whose ``href`` has the
    relative form ``article/<digits>`` and whose stripped text is non-empty.

    Returns:
        A list of ``(url, title)`` tuples in page order with exact duplicate
        pairs removed. ``url`` is ``BASE`` concatenated with the href.

    Raises:
        requests.RequestException: If the listing page cannot be fetched.
    """
    # Compile once instead of re-resolving the pattern for every anchor.
    article_href = re.compile(r"^article/(\d+)$")
    soup = BeautifulSoup(get(LIST_URL), "lxml")
    # A dict preserves insertion order and gives O(1) duplicate checks,
    # unlike scanning a growing list for every anchor.
    unique = {}
    for anchor in soup.find_all("a", href=True):
        href = anchor["href"]
        if not article_href.match(href):
            continue
        title = anchor.get_text(strip=True)
        if not title:
            # Anchors without text cannot supply a title; skip them.
            continue
        unique.setdefault((BASE + href, title), None)
    return list(unique)


def extract_article(url):
    """Download one article page and pull out its heading and body text.

    Args:
        url: Address of the article page.

    Returns:
        A ``(title, text)`` tuple. ``title`` is the stripped text of the
        first ``<h1>``, or ``""`` when the page has none. ``text`` is the
        text of the ``.d2txt_con`` element, or of ``#content`` when the
        former is missing, with fragments joined by newlines; it is ``""``
        when neither element is found.
    """
    page = BeautifulSoup(get(url), "lxml")

    heading = page.find("h1")
    title = ""
    if heading:
        title = heading.get_text(strip=True)

    # Prefer the primary body container; fall back to the alternate one.
    container = page.select_one(".d2txt_con")
    if not container:
        container = page.select_one("#content")

    text = ""
    if container:
        text = container.get_text("\n", strip=True)
    return title, text


def main():
    """Crawl every listed article and save the results to ``OUT_XLSX``.

    Gets the article links from ``extract_list()``, fetches each article
    with ``extract_article()``, and writes one row per successfully fetched
    article (columns ``url``, ``title``, ``content``) to the Excel file.
    An article that fails is reported on stdout and skipped, so a single
    bad page does not abort the crawl.

    Raises:
        requests.RequestException: If the listing page itself cannot be
            fetched (per-article failures are caught and reported).
    """
    links = extract_list()
    total = len(links)
    print(f"found {total} links")
    rows = []
    for i, (url, link_title) in enumerate(links, 1):
        try:
            t, body = extract_article(url)
            # Fall back to the link text when the page has no <h1> title.
            title = t or link_title
            print(f"[{i}/{total}] {title[:40]}  chars={len(body)}")
            rows.append({"url": url, "title": title, "content": body})
        except Exception as e:
            # Deliberate best-effort: report the failure and keep crawling.
            print(f"[{i}] failed: {e}")
        # Pause after every request, including failed ones, so a struggling
        # server is not hit with back-to-back requests.
        time.sleep(0.5)
    # Name the columns explicitly so the sheet has its header row even when
    # no article could be fetched.
    df = pd.DataFrame(rows, columns=["url", "title", "content"])
    df.to_excel(OUT_XLSX, index=False)
    print(f"saved {OUT_XLSX}, {len(df)} rows")


# Run the crawl only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()