Initial Commit

This commit is contained in:
grtsinry43 2026-04-29 18:34:27 +08:00
commit a4ec0b90da
Signed by: grtsinry43
GPG Key ID: F3305FB3A978C934
32 changed files with 229785 additions and 0 deletions

2
.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
best.pt
.codex

View File

@ -0,0 +1,3 @@
[General]
logo=
name=

Binary file not shown.

View File

@ -0,0 +1 @@
["B", "M", "E", "S"]

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,14 @@
{
"backend": "tokenizers",
"cls_token": "[CLS]",
"do_lower_case": true,
"is_local": false,
"mask_token": "[MASK]",
"model_max_length": 1000000000000000019884624838656,
"pad_token": "[PAD]",
"sep_token": "[SEP]",
"strip_accents": null,
"tokenize_chinese_chars": true,
"tokenizer_class": "BertTokenizer",
"unk_token": "[UNK]"
}

View File

@ -0,0 +1,129 @@
"""用训练好的 BERT-BMES 做 test 分词。"""
import csv, torch
import torch.nn as nn
from transformers import BertTokenizerFast, BertModel
# Checkpoint file loaded by main() with torch.load().
CKPT = "ckpt/best.pt"
# Directory the tokenizer is loaded from.
TOK_DIR = "ckpt"
# Hugging Face model id used to build the encoder before the checkpoint is applied.
MODEL_NAME = "hfl/chinese-bert-wwm-ext"
# Input CSV; main() skips its first row and reads rows as (id, sentence).
TEST = "../test.csv"
# Output CSV written by main().
OUT = "../submission_bert.csv"
# Maximum characters per chunk; the tokenizer is called with max_length=MAX_LEN + 2.
MAX_LEN = 510
# Number of chunks per forward pass.
BATCH = 32
# Tag inventory; a predicted class index is mapped to a tag through this list.
LABELS = ["B", "M", "E", "S"]
# Use CUDA when available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class BertTagger(nn.Module):
    """BERT encoder followed by dropout and a per-position linear classifier."""

    def __init__(self, name, n):
        """Load the pretrained encoder ``name`` and build an ``n``-way head."""
        super().__init__()
        self.bert = BertModel.from_pretrained(name)
        self.drop = nn.Dropout(0.1)
        hidden = self.bert.config.hidden_size
        self.cls = nn.Linear(hidden, n)

    def forward(self, ids, attn):
        """Return the classifier output for every position of ``ids``."""
        encoded = self.bert(ids, attention_mask=attn)
        features = self.drop(encoded.last_hidden_state)
        return self.cls(features)
def chunk_chars(chars, max_len):
    """Split ``chars`` into consecutive slices of at most ``max_len`` items."""
    pieces = []
    for start in range(0, len(chars), max_len):
        pieces.append(chars[start:start + max_len])
    return pieces
def fix_tags(tags):
    """Close words that are left open when the next tag starts a new word.

    Whenever a ``B`` or ``M`` is directly followed by ``B`` or ``S``, the
    earlier output tag is rewritten: after ``B`` it becomes ``S``, after
    ``M`` it becomes ``E``.  Every other tag is copied through unchanged.
    """
    closed_form = {"B": "S", "M": "E"}
    repaired = []
    previous = None
    for tag in tags:
        if previous in closed_form and tag in ("B", "S"):
            repaired[-1] = closed_form[previous]
        repaired.append(tag)
        # Track the raw input tag, not the repaired one.
        previous = tag
    return repaired
def tags_to_words(chars, tags):
    """Group characters into words according to their B/M/E/S tags.

    ``B`` starts a new word (emitting any unfinished one first), ``M``
    extends the current word, ``E`` extends and closes it, and any other tag
    emits the unfinished word followed by the character on its own.  A word
    still open at the end of the input is emitted as it stands.
    """
    words = []
    current = ""
    for ch, tag in zip(chars, tags):
        if tag in ("M", "E"):
            current += ch
            if tag == "E":
                words.append(current)
                current = ""
            continue
        # "B" and every remaining tag first flush whatever is pending.
        if current:
            words.append(current)
        if tag == "B":
            current = ch
        else:
            current = ""
            words.append(ch)
    if current:
        words.append(current)
    return words
def transfer(words):
    """Render words as space-separated lists of their character indices.

    Indices count characters from 0 across the whole word list, e.g.
    ``["ab", "c"]`` becomes ``"[0,1] [2]"``.
    """
    pieces = []
    offset = 0
    for word in words:
        end = offset + len(word)
        pieces.append("[" + ",".join(map(str, range(offset, end))) + "]")
        offset = end
    return " ".join(pieces)
def main():
    """Segment every sentence of TEST with the fine-tuned tagger and write OUT.

    Sentences are split into chunks of at most MAX_LEN characters, tagged in
    batches, stitched back together per sentence, repaired with fix_tags()
    and written as ``(id, index-encoded segmentation)`` rows.
    """
    tok = BertTokenizerFast.from_pretrained(TOK_DIR)
    model = BertTagger(MODEL_NAME, len(LABELS)).to(device)
    model.load_state_dict(torch.load(CKPT, map_location=device))
    model.eval()

    with open(TEST, encoding="utf-8") as f:
        reader = csv.reader(f)
        next(reader)  # skip the header row
        # Keep only the first two columns so a row with extra fields cannot
        # break the (id, sentence) unpacking below.
        rows = [(r[0], r[1]) for r in reader if len(r) >= 2]

    jobs = []
    for i, (_, sent) in enumerate(rows):
        for s, sub in enumerate(chunk_chars(list(sent), MAX_LEN)):
            jobs.append((i, s, sub))
    # Chunks of similar length end up in the same batch (padding=True pads
    # each batch to its longest member).
    jobs.sort(key=lambda x: len(x[2]))

    all_tags = {}
    with torch.no_grad():
        for b in range(0, len(jobs), BATCH):
            batch = jobs[b:b + BATCH]
            chars_list = [j[2] for j in batch]
            enc = tok(chars_list, is_split_into_words=True, truncation=True,
                      max_length=MAX_LEN + 2, padding=True, return_tensors="pt")
            ids = enc["input_ids"].to(device)
            attn = enc["attention_mask"].to(device)
            with torch.amp.autocast("cuda", dtype=torch.float16):
                logits = model(ids, attn)
            preds = logits.argmax(-1).cpu().numpy()
            for k, (i, s, chars) in enumerate(batch):
                # Place each prediction at its character's own position.  A
                # character that yields no token (e.g. whitespace) keeps the
                # default "S" and no longer shifts or drops the tags of the
                # characters after it.
                tags = ["S"] * len(chars)
                seen = set()
                for j, wid in enumerate(enc.word_ids(batch_index=k)):
                    if wid is None or wid in seen or wid >= len(chars):
                        continue
                    seen.add(wid)  # only the first sub-token of a character counts
                    tags[wid] = LABELS[preds[k][j]]
                all_tags[(i, s)] = tags
            if (b // BATCH) % 20 == 0:
                print(f" {b}/{len(jobs)}")

    with open(OUT, "w", encoding="utf-8", newline="") as fo:
        w = csv.writer(fo)
        # NOTE(review): header is written as "ID" here; confirm the casing the
        # consumer of this file expects.
        w.writerow(["ID", "expected"])
        for i, (sid, sent) in enumerate(rows):
            chars = list(sent)
            full_tags = []
            s = 0
            while (i, s) in all_tags:
                full_tags.extend(all_tags[(i, s)])
                s += 1
            # Safety net: force one tag per character.
            if len(full_tags) != len(chars):
                full_tags.extend("S" * (len(chars) - len(full_tags)))
                full_tags = full_tags[:len(chars)]
            full_tags = fix_tags(full_tags)
            words = tags_to_words(chars, full_tags)
            w.writerow([sid, transfer(words)])
    print(f"wrote {OUT}")


if __name__ == "__main__":
    main()

54
exp1_fenci/bert/train.log Normal file
View File

@ -0,0 +1,54 @@
train=85180 val=1738
Loading weights: 0%| | 0/199 [00:00<?, ?it/s] Loading weights: 100%|██████████| 199/199 [00:00<00:00, 42105.96it/s]
BertModel LOAD REPORT from: hfl/chinese-bert-wwm-ext
Key | Status | |
-------------------------------------------+------------+--+-
cls.seq_relationship.bias | UNEXPECTED | |
cls.predictions.transform.LayerNorm.bias | UNEXPECTED | |
cls.predictions.bias | UNEXPECTED | |
cls.predictions.transform.dense.bias | UNEXPECTED | |
cls.predictions.transform.dense.weight | UNEXPECTED | |
cls.seq_relationship.weight | UNEXPECTED | |
cls.predictions.decoder.weight | UNEXPECTED | |
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | |
Notes:
- UNEXPECTED: can be ignored when loading from different task/architecture; not ok if you expect identical arch.
Exception in thread Thread-auto_conversion:
Traceback (most recent call last):
File "/home/grtsinry43/.conda/envs/nlp-exp/lib/python3.11/site-packages/huggingface_hub/utils/_http.py", line 761, in hf_raise_for_status
response.raise_for_status()
File "/home/grtsinry43/.conda/envs/nlp-exp/lib/python3.11/site-packages/httpx/_models.py", line 829, in raise_for_status
raise HTTPStatusError(message, request=request, response=self)
httpx.HTTPStatusError: Client error '403 Forbidden' for url 'https://huggingface.co/api/models/hfl/chinese-bert-wwm-ext/discussions?p=0'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/403
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/home/grtsinry43/.conda/envs/nlp-exp/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
self.run()
File "/home/grtsinry43/.conda/envs/nlp-exp/lib/python3.11/threading.py", line 982, in run
self._target(*self._args, **self._kwargs)
File "/home/grtsinry43/.conda/envs/nlp-exp/lib/python3.11/site-packages/transformers/safetensors_conversion.py", line 117, in auto_conversion
raise e
File "/home/grtsinry43/.conda/envs/nlp-exp/lib/python3.11/site-packages/transformers/safetensors_conversion.py", line 96, in auto_conversion
sha = get_conversion_pr_reference(api, pretrained_model_name_or_path, **cached_file_kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/grtsinry43/.conda/envs/nlp-exp/lib/python3.11/site-packages/transformers/safetensors_conversion.py", line 69, in get_conversion_pr_reference
pr = previous_pr(api, model_id, pr_title, token=token)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/grtsinry43/.conda/envs/nlp-exp/lib/python3.11/site-packages/transformers/safetensors_conversion.py", line 14, in previous_pr
for discussion in get_repo_discussions(repo_id=model_id, token=token):
File "/home/grtsinry43/.conda/envs/nlp-exp/lib/python3.11/site-packages/huggingface_hub/hf_api.py", line 6949, in get_repo_discussions
discussions, has_next = _fetch_discussion_page(page_index=page_index)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/grtsinry43/.conda/envs/nlp-exp/lib/python3.11/site-packages/huggingface_hub/hf_api.py", line 6938, in _fetch_discussion_page
hf_raise_for_status(resp)
File "/home/grtsinry43/.conda/envs/nlp-exp/lib/python3.11/site-packages/huggingface_hub/utils/_http.py", line 849, in hf_raise_for_status
raise _format(HfHubHTTPError, message, response) from e
huggingface_hub.errors.HfHubHTTPError: (Request ID: Root=1-69e66348-038e57a352b76902655292cb;7091e990-eb43-487d-900e-ca182e86423e)
403 Forbidden: Discussions are disabled for this repo.
Cannot access content at: https://huggingface.co/api/models/hfl/chinese-bert-wwm-ext/discussions?p=0.
Make sure your token has the correct permissions.

View File

@ -0,0 +1,177 @@
"""BERT 字符级 BMES 分词训练。"""
import csv, os, random, json, time
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertModel, get_linear_schedule_with_warmup
from torch.optim import AdamW
# Training CSV; load_pairs() skips its first row and reads the first column.
TRAIN = "../train.csv"
# Hugging Face model id used for both the tokenizer and the encoder.
MODEL_NAME = "hfl/chinese-bert-wwm-ext"
# Directory that receives best.pt, the tokenizer files and labels.json.
SAVE_DIR = "ckpt"
# Token length every example is padded/truncated to.
MAX_LEN = 128
# Training batch size (validation uses BATCH * 2).
BATCH = 32
EPOCHS = 3
# Learning rate passed to AdamW.
LR = 3e-5
SEED = 42
# Fraction of the shuffled pairs held out for validation.
VAL_RATIO = 0.02
LABELS = ["B", "M", "E", "S"]
# Tag -> class index.
L2I = {l: i for i, l in enumerate(LABELS)}
# Label for positions without a word id; passed to the loss as ignore_index.
PAD_ID = -100
random.seed(SEED); torch.manual_seed(SEED)
# Use CUDA when available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def tag_word(w):
    """Return the BMES tags for the characters of word ``w``.

    A one-character word is ``["S"]``; otherwise the result is ``B``,
    ``len(w) - 2`` times ``M``, then ``E``.
    """
    length = len(w)
    if length == 1:
        return ["S"]
    return ["B", *(["M"] * (length - 2)), "E"]
def load_pairs():
    """Read TRAIN and return a list of ``(chars, tags)`` pairs.

    The first row is skipped.  The first column of every other row is treated
    as a space-separated segmentation; rows that are empty or contain no
    words are dropped.
    """
    pairs = []
    with open(TRAIN, encoding="utf-8") as fh:
        rows = csv.reader(fh)
        next(rows)
        for row in rows:
            if not row:
                continue
            words = [piece for piece in row[0].strip().split(" ") if piece]
            if not words:
                continue
            chars = [ch for word in words for ch in word]
            tags = [tag for word in words for tag in tag_word(word)]
            pairs.append((chars, tags))
    return pairs
class SegDS(Dataset):
    """Dataset that tokenizes ``(chars, tags)`` pairs on access."""

    def __init__(self, pairs, tok):
        self.pairs = pairs
        self.tok = tok

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, i):
        """Return ``(input_ids, attention_mask, labels)`` for example ``i``.

        Characters and tags are cut to ``MAX_LEN - 2`` items before
        tokenization.  Positions whose word id is ``None`` get ``PAD_ID``;
        every other position gets the class index of its character's tag.
        """
        chars, tags = self.pairs[i]
        budget = MAX_LEN - 2
        chars, tags = chars[:budget], tags[:budget]
        enc = self.tok(chars, is_split_into_words=True, truncation=True, max_length=MAX_LEN,
                       padding="max_length", return_tensors="pt")
        labels = [
            PAD_ID if wid is None else L2I[tags[wid]]
            for wid in enc.word_ids(batch_index=0)
        ]
        return (
            enc["input_ids"].squeeze(0),
            enc["attention_mask"].squeeze(0),
            torch.tensor(labels, dtype=torch.long),
        )
class BertTagger(nn.Module):
    """BERT encoder followed by dropout and a per-position linear classifier."""

    def __init__(self, name, n_labels):
        """Load the pretrained encoder ``name`` and build an ``n_labels``-way head."""
        super().__init__()
        self.bert = BertModel.from_pretrained(name)
        self.drop = nn.Dropout(0.1)
        hidden = self.bert.config.hidden_size
        self.cls = nn.Linear(hidden, n_labels)

    def forward(self, ids, attn):
        """Return the classifier output for every position of ``ids``."""
        encoded = self.bert(ids, attention_mask=attn)
        features = self.drop(encoded.last_hidden_state)
        return self.cls(features)
def seg_f1(gold_tags, pred_tags):
    """Return the word-level F1 between two tag sequences.

    Each sequence is converted to a set of inclusive ``(start, end)`` word
    spans; a predicted word counts only when the same span occurs in the
    gold set.  Returns 0.0 when no span matches.
    """
    def spans_of(tags):
        # Tolerant decoding: an unfinished word is closed just before the
        # next "B"/"S" or at the end of the sequence, and an "M"/"E" with no
        # open word opens one at its own position.
        spans = set()
        open_at = None
        for pos, tag in enumerate(tags):
            if tag in ("B", "S"):
                if open_at is not None:
                    spans.add((open_at, pos - 1))
                if tag == "B":
                    open_at = pos
                else:
                    spans.add((pos, pos))
                    open_at = None
            elif tag in ("M", "E"):
                if open_at is None:
                    open_at = pos
                if tag == "E":
                    spans.add((open_at, pos))
                    open_at = None
        if open_at is not None:
            spans.add((open_at, len(tags) - 1))
        return spans

    gold, pred = spans_of(gold_tags), spans_of(pred_tags)
    hits = len(gold & pred)
    if hits == 0:
        return 0.0
    precision = hits / len(pred)
    recall = hits / len(gold)
    return 2 * precision * recall / (precision + recall)
def run_val(model, loader):
    """Return the mean per-sequence seg_f1 of ``model`` over ``loader``.

    The model is switched to eval mode.  Positions labelled ``PAD_ID`` are
    dropped from both gold and prediction; sequences with no remaining gold
    label are skipped.  Returns 0 when nothing was scored.
    """
    model.eval()
    scores = []
    with torch.no_grad():
        for ids, attn, labels in loader:
            logits = model(ids.to(device), attn.to(device))
            pred_rows = logits.argmax(-1).cpu().numpy()
            for pred_row, gold_row in zip(pred_rows, labels.numpy()):
                keep = gold_row != PAD_ID
                gold = [LABELS[x] for x in gold_row[keep]]
                if not gold:
                    continue
                guess = [LABELS[x] for x in pred_row[keep]]
                scores.append(seg_f1(gold, guess))
    return sum(scores) / max(len(scores), 1)
def main():
    """Fine-tune the tagger and keep the checkpoint with the best validation F1.

    After every epoch the model is scored with run_val(); when the score
    improves, the weights, the tokenizer and labels.json are written to
    SAVE_DIR.
    """
    os.makedirs(SAVE_DIR, exist_ok=True)
    tok = BertTokenizerFast.from_pretrained(MODEL_NAME)

    pairs = load_pairs()
    random.shuffle(pairs)
    n_val = int(len(pairs) * VAL_RATIO)
    val_pairs = pairs[:n_val]
    train_pairs = pairs[n_val:]
    print(f"train={len(train_pairs)} val={len(val_pairs)}")

    loader_opts = dict(num_workers=2, pin_memory=True)
    tr_loader = DataLoader(SegDS(train_pairs, tok), batch_size=BATCH, shuffle=True, **loader_opts)
    va_loader = DataLoader(SegDS(val_pairs, tok), batch_size=BATCH * 2, **loader_opts)

    model = BertTagger(MODEL_NAME, len(LABELS)).to(device)
    opt = AdamW(model.parameters(), lr=LR, weight_decay=0.01)
    total_steps = len(tr_loader) * EPOCHS
    warmup_steps = int(0.1 * total_steps)
    sched = get_linear_schedule_with_warmup(opt, warmup_steps, total_steps)
    loss_fn = nn.CrossEntropyLoss(ignore_index=PAD_ID)
    scaler = torch.amp.GradScaler("cuda")

    best_f1 = 0.0
    global_step = 0
    for epoch_no in range(1, EPOCHS + 1):
        model.train()
        started = time.time()
        loss_sum = 0.0
        for ids, attn, labels in tr_loader:
            ids = ids.to(device)
            attn = attn.to(device)
            labels = labels.to(device)
            opt.zero_grad()
            with torch.amp.autocast("cuda", dtype=torch.float16):
                logits = model(ids, attn)
                loss = loss_fn(logits.view(-1, len(LABELS)), labels.view(-1))
            scaler.scale(loss).backward()
            # Unscale first so clipping sees the true gradient norm.
            scaler.unscale_(opt)
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            scaler.step(opt)
            scaler.update()
            sched.step()
            loss_sum += loss.item()
            global_step += 1
            if global_step % 200 == 0:
                print(f" ep{epoch_no} step{global_step} loss={loss_sum/200:.4f} lr={sched.get_last_lr()[0]:.2e}")
                loss_sum = 0.0

        val_f1 = run_val(model, va_loader)
        elapsed = time.time() - started
        print(f"[epoch {epoch_no}] val F1={val_f1:.4f} time={elapsed:.1f}s")
        if val_f1 > best_f1:
            best_f1 = val_f1
            torch.save(model.state_dict(), f"{SAVE_DIR}/best.pt")
            tok.save_pretrained(SAVE_DIR)
            with open(f"{SAVE_DIR}/labels.json", "w") as f:
                json.dump(LABELS, f)
            print(f" saved best -> {SAVE_DIR}/best.pt")
    print(f"best val F1 = {best_f1:.4f}")


if __name__ == "__main__":
    main()

File diff suppressed because it is too large Load Diff

Binary file not shown.

1
exp1_fenci/hmm.json Normal file

File diff suppressed because one or more lines are too long

162
exp1_fenci/predict.py Normal file
View File

@ -0,0 +1,162 @@
"""用 HMM + 最大匹配词典混合的方式分词。
策略先用高频词典最大匹配对未匹配部分用 HMM Viterbi
"""
import csv, json, sys
from collections import defaultdict
# JSON file holding the HMM parameters; read by load_model().
MODEL = "hmm.json"
# Input CSV; main() skips the first row and uses the first two columns.
TEST = "test.csv"
# Output CSV written by main().
OUT = "submission.csv"
# Word list; load_dict() takes the second whitespace-separated field of each line.
FREQ = "chinese_word_freq_list.txt"
# Tag set iterated by viterbi().
STATES = ("B", "M", "E", "S")
def load_model():
    """Load and return the HMM parameters stored as JSON in MODEL."""
    with open(MODEL, encoding="utf-8") as fh:
        params = json.load(fh)
    return params
def load_dict():
    """Collect dictionary words from the word list FREQ.

    Each line is split on whitespace; the second field is taken as the word
    and kept only when it is at least two characters long.  Lines with fewer
    than two fields are ignored.
    """
    with open(FREQ, encoding="utf-8") as fh:
        fields_per_line = (line.strip().split() for line in fh)
        return {
            fields[1]
            for fields in fields_per_line
            if len(fields) >= 2 and len(fields[1]) >= 2
        }
def viterbi(chars, model):
    """Return the highest-scoring B/M/E/S tag sequence for ``chars``.

    Args:
        chars: sequence of characters to tag.
        model: mapping with ``init`` (state -> score), ``trans``
            (state -> state -> score), ``emit`` (state -> char -> score) and
            ``emit_default`` (state -> score for unseen characters).  Scores
            are added together, so they are expected to be log-probabilities.

    Returns:
        A list with one tag per character; ``[]`` for empty input.

    Only the final tag is hard-restricted (to ``E`` or ``S``).  Start and
    transition legality (B/S first, B->M/E, M->M/E, E->B/S, S->B/S) is not
    enforced here and depends on the scores stored in ``model``.
    """
    if not chars:
        return []
    init = model["init"]
    trans = model["trans"]
    emit = model["emit"]
    emit_def = model["emit_default"]

    score = {s: init[s] + emit[s].get(chars[0], emit_def[s]) for s in STATES}
    # back[t - 1][s] is the best predecessor of state s at position t.
    # Back-pointers replace copying a full path per state at every step.
    back = []
    for ch in chars[1:]:
        new_score = {}
        pointers = {}
        for s in STATES:
            e = emit[s].get(ch, emit_def[s])
            # Ties are broken by the larger state name, as tuple max() does.
            new_score[s], pointers[s] = max(
                (score[p] + trans[p][s] + e, p) for p in STATES
            )
        score = new_score
        back.append(pointers)

    # The last tag must be E or S.
    _, state = max((score[s], s) for s in ("E", "S"))
    tags = [state]
    for pointers in reversed(back):
        state = pointers[state]
        tags.append(state)
    tags.reverse()
    return tags
def seg_by_tags(chars, tags):
    """Cut ``chars`` into words following their B/M/E/S tags.

    ``B`` starts a word, ``M`` extends it, ``E`` extends and closes it; any
    other tag is treated as ``S`` and emits the character alone.  A word
    left unfinished is emitted before the next ``B``/``S`` and at the end.
    """
    words = []
    word = ""
    for ch, tag in zip(chars, tags):
        if tag == "M":
            word += ch
            continue
        if tag == "E":
            words.append(word + ch)
            word = ""
            continue
        # "B" or "S": whatever is pending ends here.
        if word:
            words.append(word)
        if tag == "B":
            word = ch
            continue
        word = ""
        words.append(ch)
    if word:
        words.append(word)
    return words
def max_match(chars, word_set, max_len=6):
    """Forward maximum matching over ``chars``.

    At each position the longest candidate of at most ``max_len`` characters
    (and at least two) found in ``word_set`` is taken; positions without a
    match are skipped one character at a time.

    Returns:
        List of ``(start, end)`` spans, start inclusive and end exclusive.
    """
    total = len(chars)
    spans = []
    pos = 0
    while pos < total:
        longest = min(max_len, total - pos)
        hit = next(
            (size for size in range(longest, 1, -1)
             if "".join(chars[pos:pos + size]) in word_set),
            None,
        )
        if hit is None:
            pos += 1
        else:
            spans.append((pos, pos + hit))
            pos += hit
    return spans
def hybrid_segment(sentence, model, word_set):
    """Segment ``sentence`` with dictionary matches plus HMM for the gaps.

    Spans found by max_match() are kept as whole words; every stretch of
    characters between (and after) those spans is tagged with viterbi() and
    cut with seg_by_tags().  Returns the words in sentence order.
    """
    chars = list(sentence)
    n = len(chars)
    if n == 0:
        return []

    def hmm_words(lo, hi):
        piece = chars[lo:hi]
        return seg_by_tags(piece, viterbi(piece, model))

    words = []
    cursor = 0
    for start, end in max_match(chars, word_set):
        if cursor < start:
            words.extend(hmm_words(cursor, start))
        words.append("".join(chars[start:end]))
        cursor = end
    if cursor < n:
        words.extend(hmm_words(cursor, n))
    return words
def transfer(words):
    """Encode words as bracketed character-index lists joined by spaces.

    Example: ``["ab", "c"]`` -> ``"[0,1] [2]"``.
    """
    encoded = []
    start = 0
    for word in words:
        stop = start + len(word)
        indices = ",".join(str(i) for i in range(start, stop))
        encoded.append(f"[{indices}]")
        start = stop
    return " ".join(encoded)
def main():
    """Segment every row of TEST and write the index-encoded result to OUT.

    The first input row is skipped as a header; rows with fewer than two
    columns are ignored.
    """
    model = load_model()
    word_set = load_dict()
    print(f"dict size={len(word_set)}")
    with open(TEST, encoding="utf-8") as src, \
            open(OUT, "w", encoding="utf-8", newline="") as dst:
        rows = csv.reader(src)
        sink = csv.writer(dst)
        next(rows)  # header row: id,sentence
        sink.writerow(["id", "expected"])
        for row in rows:
            if len(row) >= 2:
                sid, sent = row[:2]
                sink.writerow([sid, transfer(hybrid_segment(sent, model, word_set))])
    print(f"wrote {OUT}")


if __name__ == "__main__":
    main()

View File

@ -0,0 +1,11 @@
id,expected
1,"[0,1] [2,3] [4] [5] [6,7] [8,9] [10] [11,12]"
2,"[0,1] [2] [3,4] [5,6] [7] [8,9] [10]"
3,"[0,1,2] [3,4] [5,6] [7] [8] [9] [10] [11,12] [13] [14] [15,16] [17,18] [19] [20,21,22] [23]"
4,"[0,1] [2,3] [4] [5,6] [7] [8] [9,10] [11,12] [13] [14,15] [16,17,18,19] [20] [21,22,23] [24,25] [26,27] [28] [29] [30]"
5,"[0] [1] [2] [3] [4] [5] [6,7] [8,9] [10,11] [12] [13] [14] [15,16] [17] [18,19] [20,21] [22,23] [24] [25,26] [27,28] [29]"
6,"[0,1] [2] [3] [4] [5,6] [7,8] [9] [10,11] [12,13] [14,15] [16,17] [18] [19,20] [21] [22,23] [24]"
7,"[0,1,2,3] [4] [5,6] [7] [8] [9,10,11] [12] [13] [14,15] [16] [17,18] [19] [20,21] [22] [23] [24,25] [26] [27] [28,29] [30,31] [32,33] [34]"
8,"[0] [1,2] [3] [4] [5,6] [7,8] [9,10] [11] [12,13] [14]"
9,"[0] [1] [2] [3,4] [5] [6,7] [8]"
10,"[0] [1] [2] [3,4] [5] [6,7] [8,9] [10] [11,12] [13] [14] [15,16] [17]"
1 id expected
2 1 [0,1] [2,3] [4] [5] [6,7] [8,9] [10] [11,12]
3 2 [0,1] [2] [3,4] [5,6] [7] [8,9] [10]
4 3 [0,1,2] [3,4] [5,6] [7] [8] [9] [10] [11,12] [13] [14] [15,16] [17,18] [19] [20,21,22] [23]
5 4 [0,1] [2,3] [4] [5,6] [7] [8] [9,10] [11,12] [13] [14,15] [16,17,18,19] [20] [21,22,23] [24,25] [26,27] [28] [29] [30]
6 5 [0] [1] [2] [3] [4] [5] [6,7] [8,9] [10,11] [12] [13] [14] [15,16] [17] [18,19] [20,21] [22,23] [24] [25,26] [27,28] [29]
7 6 [0,1] [2] [3] [4] [5,6] [7,8] [9] [10,11] [12,13] [14,15] [16,17] [18] [19,20] [21] [22,23] [24]
8 7 [0,1,2,3] [4] [5,6] [7] [8] [9,10,11] [12] [13] [14,15] [16] [17,18] [19] [20,21] [22] [23] [24,25] [26] [27] [28,29] [30,31] [32,33] [34]
9 8 [0] [1,2] [3] [4] [5,6] [7,8] [9,10] [11] [12,13] [14]
10 9 [0] [1] [2] [3,4] [5] [6,7] [8]
11 10 [0] [1] [2] [3,4] [5] [6,7] [8,9] [10] [11,12] [13] [14] [15,16] [17]

3986
exp1_fenci/submission.csv Normal file

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

3986
exp1_fenci/test.csv Normal file

File diff suppressed because it is too large Load Diff

86925
exp1_fenci/train.csv Normal file

File diff suppressed because it is too large Load Diff

72
exp1_fenci/train_hmm.py Normal file
View File

@ -0,0 +1,72 @@
"""HMM 分词器BMES 标签 + Laplace 平滑,模型存为 json。"""
import csv, json, math
from collections import defaultdict
# Training CSV; main() skips the first row and reads the first column.
TRAIN = "train.csv"
# Destination of the JSON-encoded model.
MODEL = "hmm.json"
# Tag set; main() keeps one transition table and one emission table per state.
STATES = ("B", "M", "E", "S")
def tag_word(w):
    """Return the BMES tags of word ``w``: ``S`` alone, else ``B M* E``."""
    size = len(w)
    if size == 1:
        return ["S"]
    inner = ["M"] * (size - 2)
    return ["B"] + inner + ["E"]
def main():
    """Estimate HMM parameters from TRAIN and save them to MODEL as JSON.

    Counts initial tags, tag transitions and tag->character emissions over
    the segmented training sentences, applies add-one smoothing, and stores
    the natural logs under the keys ``init``, ``trans``, ``emit`` and
    ``emit_default``.
    """
    init = defaultdict(float)
    trans = {s: defaultdict(float) for s in STATES}
    emit = {s: defaultdict(float) for s in STATES}
    total_lines = 0

    with open(TRAIN, encoding="utf-8") as fh:
        rows = csv.reader(fh)
        next(rows)
        for row in rows:
            if not row:
                continue
            words = [w for w in row[0].strip().split(" ") if w]
            if not words:
                continue
            chars = [c for w in words for c in w]
            tags = [t for w in words for t in tag_word(w)]
            init[tags[0]] += 1
            prev = None
            for c, t in zip(chars, tags):
                emit[t][c] += 1
                if prev is not None:
                    trans[prev][t] += 1
                prev = t
            total_lines += 1

    n_states = len(STATES)
    init_total = sum(init.values())
    init_log = {s: math.log((init[s] + 1) / (init_total + n_states)) for s in STATES}

    trans_log = {}
    for s in STATES:
        row_total = sum(trans[s].values())
        trans_log[s] = {
            t: math.log((trans[s][t] + 1) / (row_total + n_states)) for t in STATES
        }

    vocab = set().union(*(emit[s].keys() for s in STATES))
    # One extra slot is reserved for characters never seen in training.
    V = len(vocab) + 1
    emit_log = {}
    emit_default = {}
    for s in STATES:
        denom = sum(emit[s].values()) + V
        emit_log[s] = {c: math.log((n + 1) / denom) for c, n in emit[s].items()}
        emit_default[s] = math.log(1 / denom)

    with open(MODEL, "w", encoding="utf-8") as f:
        json.dump(
            {"init": init_log, "trans": trans_log, "emit": emit_log, "emit_default": emit_default},
            f, ensure_ascii=False,
        )
    print(f"trained on {total_lines} sentences, vocab={len(vocab)}, saved {MODEL}")


if __name__ == "__main__":
    main()

20
exp1_fenci/transfer.py Normal file
View File

@ -0,0 +1,20 @@
"""
切分结果转换脚本
param:
raw_sen: 切分结果由空格隔开的字符串 自然 语言 处理
return
转换为序列的字符串"[0] [1] [2,3] [4,5] [6,7]"
"""
def transfer(raw_sen):
    """Convert a space-separated segmentation into character-index lists.

    Args:
        raw_sen: segmented sentence whose words are separated by spaces.

    Returns:
        One bracketed, comma-separated index list per word, joined by single
        spaces.  Indices count characters from 0 and ignore the separators,
        e.g. ``"ab c"`` -> ``"[0,1] [2]"``.  An empty or all-space sentence
        gives ``""``.
    """
    count = 0
    pieces = []
    for word in raw_sen.strip().split(' '):
        if not word:
            # Consecutive spaces (or an empty sentence) produce empty pieces;
            # skip them instead of emitting a meaningless "[]".
            continue
        end = count + len(word)
        pieces.append("[" + ",".join(map(str, range(count, end))) + "]")
        count = end
    return ' '.join(pieces)

87
exp2_people/analyze.py Normal file
View File

@ -0,0 +1,87 @@
"""对爬到的文章分词统计 + 词云。"""
import pandas as pd
import jieba
import jieba.posseg as pseg
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from matplotlib import font_manager
plt.rcParams["axes.unicode_minus"] = False

import os
import os as _os  # alias kept: the module has always bound this name

INPUT = "articles.xlsx"
FONT = "/usr/share/fonts/adobe-source-han-sans/SourceHanSansCN-Normal.otf"
FONT_FALLBACK = "/usr/share/fonts/noto-cjk/NotoSansCJK-Regular.ttc"

# Register the first font file that exists with matplotlib so that CJK
# titles render; if neither exists matplotlib keeps its defaults.
for _fp in [FONT, FONT_FALLBACK]:
    if _os.path.exists(_fp):
        font_manager.fontManager.addfont(_fp)
        plt.rcParams["font.sans-serif"] = [font_manager.FontProperties(fname=_fp).get_name()]
        break

# WordCloud takes a font file path; switch to the fallback when the preferred
# file is missing.
if not os.path.exists(FONT):
    FONT = FONT_FALLBACK

# set("...") yields single characters only, and main() already drops words
# shorter than two characters, so the multi-character stop words have to be
# added explicitly or the stop list never filters anything.
STOP = set("的了和是在也及与或对从到把被让使由于这那我们他们") | {"由于", "我们", "他们"}

# Custom dictionary entries (the assignment requires adding new words).
for w in ["习近平", "新时代", "中国式现代化", "二十大", "党中央", "共同富裕",
          "高质量发展", "一带一路", "人类命运共同体", "火神山"]:
    jieba.add_word(w, freq=1000)
def main():
    """Count nouns and verbs in the crawled articles and draw word clouds.

    Reads the ``content`` column of INPUT, tags it with jieba, keeps words of
    at least two characters that contain a CJK character and are not in
    STOP, prints and saves the 20 most common nouns and verbs, and writes
    three word-cloud images plus a combined figure.
    """
    df = pd.read_excel(INPUT)
    all_text = "\n".join(df["content"].fillna("").astype(str).tolist())
    print(f"total chars={len(all_text)}")

    nouns, verbs, all_words = Counter(), Counter(), Counter()
    for token, flag in pseg.cut(all_text):
        word = token.strip()
        if len(word) < 2 or word in STOP:
            continue
        has_cjk = any("\u4e00" <= ch <= "\u9fff" for ch in word)
        if not has_cjk:
            continue
        all_words[word] += 1
        if flag.startswith("n"):
            nouns[word] += 1
        elif flag.startswith("v"):
            verbs[word] += 1

    top_nouns = nouns.most_common(20)
    top_verbs = verbs.most_common(20)
    print("\n=== Top20 名词 ===")
    for word, count in top_nouns:
        print(f" {word}\t{count}")
    print("\n=== Top20 动词 ===")
    for word, count in top_verbs:
        print(f" {word}\t{count}")
    pd.DataFrame(top_nouns, columns=["word", "count"]).to_csv("top20_nouns.csv", index=False)
    pd.DataFrame(top_verbs, columns=["word", "count"]).to_csv("top20_verbs.csv", index=False)

    def render(freqs, path, **style):
        cloud = WordCloud(font_path=FONT, background_color="white", **style)
        cloud.generate_from_frequencies(freqs)
        cloud.to_file(path)

    # One cloud for all words, then one each for nouns and verbs.
    render(all_words, "wordcloud_all.png", width=1200, height=800, max_words=200)
    render(nouns, "wordcloud_nouns.png", width=1000, height=700,
           colormap="Blues", max_words=100)
    render(verbs, "wordcloud_verbs.png", width=1000, height=700,
           colormap="Reds", max_words=100)

    # Combined figure: the three images side by side.
    panels = [
        ("wordcloud_all.png", "整体"),
        ("wordcloud_nouns.png", "名词"),
        ("wordcloud_verbs.png", "动词"),
    ]
    fig, axes = plt.subplots(1, 3, figsize=(21, 7))
    for ax, (path, title) in zip(axes, panels):
        ax.imshow(plt.imread(path))
        ax.set_title(title, fontsize=20)
        ax.axis("off")
    plt.tight_layout()
    plt.savefig("wordclouds_combined.png", dpi=120)
    print("\nsaved wordcloud_all.png / wordcloud_nouns.png / wordcloud_verbs.png / wordclouds_combined.png")


if __name__ == "__main__":
    main()

BIN
exp2_people/articles.xlsx Normal file

Binary file not shown.

66
exp2_people/crawl.py Normal file
View File

@ -0,0 +1,66 @@
"""爬取人民网习主席讲话数据库首页链接 + 每篇标题正文,存到 excel。"""
import requests, time, os, re
from bs4 import BeautifulSoup
import pandas as pd
# Site root; article hrefs are appended to it to form full URLs.
BASE = "http://jhsjk.people.cn/"
# Listing page that extract_list() scans for article links.
LIST_URL = BASE + "result/1?form=706&else=501"
# Headers sent with every request made by get().
HEADERS = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36"}
# Spreadsheet written by main().
OUT_XLSX = "articles.xlsx"
def get(url):
    """Fetch ``url`` with HEADERS and return the body as text.

    The text is decoded with the encoding detected from the content
    (``apparent_encoding``).  Raises the ``requests`` HTTP error for a 4xx/5xx
    status; the request times out after 20 seconds.
    """
    response = requests.get(url, headers=HEADERS, timeout=20)
    detected = response.apparent_encoding
    response.encoding = detected
    response.raise_for_status()
    return response.text
def extract_list():
    """Return the unique ``(url, title)`` article links found on LIST_URL.

    Only anchors whose href is exactly ``article/<digits>`` and whose text is
    non-empty are kept, in page order.
    """
    soup = BeautifulSoup(get(LIST_URL), "lxml")
    article_href = re.compile(r"^article/(\d+)$")
    links = []
    for anchor in soup.find_all("a", href=True):
        href = anchor["href"]
        if not article_href.match(href):
            continue
        title = anchor.get_text(strip=True)
        if not title:
            continue
        entry = (BASE + href, title)
        if entry not in links:
            links.append(entry)
    return links
def extract_article(url):
    """Download one article page and return ``(title, text)``.

    The title is the text of the first ``<h1>``; the body is taken from
    ``.d2txt_con`` or, failing that, ``#content``.  Either value is ``""``
    when its element is missing.
    """
    soup = BeautifulSoup(get(url), "lxml")
    heading = soup.find("h1")
    body = soup.select_one(".d2txt_con") or soup.select_one("#content")
    title = heading.get_text(strip=True) if heading else ""
    text = body.get_text("\n", strip=True) if body else ""
    return title, text
def main():
    """Crawl every listed article and save url/title/content to OUT_XLSX.

    A failure on one article is reported and skipped so the rest of the
    crawl continues.  The page title is preferred over the link text.
    """
    links = extract_list()
    total = len(links)
    print(f"found {total} links")

    rows = []
    for i, (url, link_title) in enumerate(links, 1):
        try:
            page_title, body = extract_article(url)
            title = page_title or link_title
            print(f"[{i}/{total}] {title[:40]} chars={len(body)}")
            rows.append({"url": url, "title": title, "content": body})
            time.sleep(0.5)  # pause between successful requests
        except Exception as e:
            print(f"[{i}] failed: {e}")

    df = pd.DataFrame(rows)
    df.to_excel(OUT_XLSX, index=False)
    print(f"saved {OUT_XLSX}, {len(df)} rows")


if __name__ == "__main__":
    main()

View File

@ -0,0 +1,21 @@
word,count
金融,109
人民,69
干部,68
城市,50
领导,41
政绩观,40
经济,40
中国,39
问题,33
群众,30
体系,30
讲话,29
海洋,28
特色,27
文章,25
风险,24
政绩,23
习近平,22
国家,22
主席,22
1 word count
2 金融 109
3 人民 69
4 干部 68
5 城市 50
6 领导 41
7 政绩观 40
8 经济 40
9 中国 39
10 问题 33
11 群众 30
12 体系 30
13 讲话 29
14 海洋 28
15 特色 27
16 文章 25
17 风险 24
18 政绩 23
19 习近平 22
20 国家 22
21 主席 22

View File

@ -0,0 +1,21 @@
word,count
发展,97
坚持,72
建设,57
工作,50
推进,48
推动,44
创新,33
加强,25
服务,24
树立,23
践行,20
贯彻,20
完善,20
实施,19
不能,18
学习,18
教育,17
深入,16
优化,15
考核,14
1 word count
2 发展 97
3 坚持 72
4 建设 57
5 工作 50
6 推进 48
7 推动 44
8 创新 33
9 加强 25
10 服务 24
11 树立 23
12 践行 20
13 贯彻 20
14 完善 20
15 实施 19
16 不能 18
17 学习 18
18 教育 17
19 深入 16
20 优化 15
21 考核 14

Binary file not shown.

After

Width:  |  Height:  |  Size: 321 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 211 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 213 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 842 KiB

Binary file not shown.

Binary file not shown.

Binary file not shown.