"""Scrape the front-page links of the People's Daily Online database of President Xi's speeches, plus each article's title and body text, and save them to Excel."""
|
|
import requests, time, os, re
|
|
from bs4 import BeautifulSoup
|
|
import pandas as pd
|
|
|
|
# Site root; relative article hrefs from the list page are appended to it.
BASE = "http://jhsjk.people.cn/"
# Listing page that is scanned for article links.
LIST_URL = BASE + "result/1?form=706&else=501"
# Browser-like User-Agent header sent with every request.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
}
# Path of the Excel workbook written at the end of the run.
OUT_XLSX = "articles.xlsx"
|
|
|
|
|
|
def get(url):
    """Fetch *url* and return the response body as decoded text.

    The request is sent with the module-level ``HEADERS`` and a 20 second
    timeout.

    Args:
        url: Absolute URL to download.

    Returns:
        The response body decoded with the charset detected from its bytes.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
    """
    r = requests.get(url, headers=HEADERS, timeout=20)
    # Check the status first: charset detection scans the whole body, which
    # is wasted work when the response is about to be rejected anyway.
    r.raise_for_status()
    # Decode with the charset sniffed from the content rather than the one
    # taken from the HTTP headers.
    r.encoding = r.apparent_encoding
    return r.text
|
|
|
|
|
|
def extract_list():
    """Collect the article links found on the list page.

    Downloads ``LIST_URL`` and keeps every ``<a>`` whose ``href`` has the
    exact relative form ``article/<digits>`` and whose text is non-empty.

    Returns:
        A list of ``(url, title)`` tuples in document order, where ``url`` is
        ``BASE`` joined with the href and ``title`` is the stripped anchor
        text. Exact duplicate ``(url, title)`` pairs are dropped.

    Raises:
        requests.RequestException: If the list page cannot be downloaded.
    """
    html = get(LIST_URL)
    soup = BeautifulSoup(html, "lxml")
    # Compiled once, outside the loop over anchors.
    article_href = re.compile(r"^article/(\d+)$")
    links = []
    # Companion set gives O(1) duplicate checks; the list keeps page order.
    seen = set()
    for a in soup.find_all("a", href=True):
        href = a["href"]
        if not article_href.match(href):
            continue
        title = a.get_text(strip=True)
        if not title:
            # Anchors without text (e.g. image-only links) carry no title.
            continue
        entry = (BASE + href, title)
        if entry not in seen:
            seen.add(entry)
            links.append(entry)
    return links
|
|
|
|
|
|
def extract_article(url):
    """Download one article page and pull out its title and body text.

    Args:
        url: Absolute URL of the article page.

    Returns:
        A ``(title, text)`` tuple. ``title`` is the stripped text of the first
        ``<h1>`` (empty string if there is none). ``text`` is the text of the
        ``.d2txt_con`` element, or of ``#content`` when the former is absent,
        with fragments joined by newlines (empty string if neither exists).
    """
    page = BeautifulSoup(get(url), "lxml")

    heading = page.find("h1")
    if heading:
        title = heading.get_text(strip=True)
    else:
        title = ""

    # Prefer the ``.d2txt_con`` container and fall back to ``#content``.
    container = page.select_one(".d2txt_con")
    if not container:
        container = page.select_one("#content")

    if container:
        text = container.get_text("\n", strip=True)
    else:
        text = ""

    return title, text
|
|
|
|
|
|
def main():
    """Crawl every listed article and write the results to ``OUT_XLSX``.

    Each article that downloads successfully becomes one row with the columns
    ``url``, ``title`` and ``content``. The title falls back to the link text
    from the list page when the article page has no ``<h1>``. A failure on a
    single article is reported and skipped, so one bad page does not abort
    the whole run.
    """
    links = extract_list()
    total = len(links)
    print(f"found {total} links")
    rows = []
    for i, (url, link_title) in enumerate(links, 1):
        try:
            t, body = extract_article(url)
            title = t or link_title
            print(f"[{i}/{total}] {title[:40]} chars={len(body)}")
            rows.append({"url": url, "title": title, "content": body})
        except Exception as e:
            # Best-effort crawl: report the failure and carry on.
            print(f"[{i}] failed: {e}")
        # Throttle after every attempt, failed ones included, so a run of
        # errors does not turn into back-to-back requests against the server.
        time.sleep(0.5)
    df = pd.DataFrame(rows)
    df.to_excel(OUT_XLSX, index=False)
    print(f"saved {OUT_XLSX}, {len(df)} rows")
|
|
|
|
|
|
# Start the crawl only when run as a script, so importing this module has no
# side effects.
if __name__ == "__main__":
    main()
|