Initial Commit

2026-04-29 18:34:27 +08:00 · 2026-04-29 18:34:27 +08:00 · a4ec0b90da
commit a4ec0b90da
32 changed files with 229785 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,2 @@
 best.pt
 .codex
--- a/HKEY_CURRENT_USER\Software\kingsoft\Office\6.0\plugins\kcopilotentrylite\aibrandcfg
+++ b/HKEY_CURRENT_USER\Software\kingsoft\Office\6.0\plugins\kcopilotentrylite\aibrandcfg
@ -0,0 +1,3 @@
 [General]
 logo=
 name=
--- a/exp1_fenci/pycache/predict.cpython-311.pyc
+++ b/exp1_fenci/pycache/predict.cpython-311.pyc
--- a/exp1_fenci/bert/ckpt/labels.json
+++ b/exp1_fenci/bert/ckpt/labels.json
@ -0,0 +1 @@
 ["B", "M", "E", "S"]
--- a/exp1_fenci/bert/ckpt/tokenizer.json
+++ b/exp1_fenci/bert/ckpt/tokenizer.json
--- a/exp1_fenci/bert/ckpt/tokenizer_config.json
+++ b/exp1_fenci/bert/ckpt/tokenizer_config.json
@ -0,0 +1,14 @@
 {
  "backend": "tokenizers",
  "cls_token": "[CLS]",
  "do_lower_case": true,
  "is_local": false,
  "mask_token": "[MASK]",
  "model_max_length": 1000000000000000019884624838656,
  "pad_token": "[PAD]",
  "sep_token": "[SEP]",
  "strip_accents": null,
  "tokenize_chinese_chars": true,
  "tokenizer_class": "BertTokenizer",
  "unk_token": "[UNK]"
 }
--- a/exp1_fenci/bert/predict_bert.py
+++ b/exp1_fenci/bert/predict_bert.py
@ -0,0 +1,129 @@
 """用训练好的 BERT-BMES 做 test 分词。"""
 import csv, torch
 import torch.nn as nn
 from transformers import BertTokenizerFast, BertModel
 # Path of the fine-tuned weights; loaded in main() via torch.load + load_state_dict.
 CKPT = "ckpt/best.pt"
 # Directory handed to BertTokenizerFast.from_pretrained in main().
 TOK_DIR = "ckpt"
 # Encoder name handed to BertModel.from_pretrained when the tagger is built.
 MODEL_NAME = "hfl/chinese-bert-wwm-ext"
 # Input CSV; main() skips its first row and uses the first two columns of each row.
 TEST = "../test.csv"
 # Output CSV written by main().
 OUT = "../submission_bert.csv"
 # Maximum number of characters per chunk; the tokenizer is called with
 # max_length=MAX_LEN + 2 (presumably leaving room for [CLS]/[SEP] - TODO confirm).
 MAX_LEN = 510
 # Number of chunks sent through the model per forward pass.
 BATCH = 32
 # Tag vocabulary; the argmax class index is mapped to a tag through this list.
 LABELS = ["B", "M", "E", "S"]
 # Use the GPU when one is available, otherwise fall back to the CPU.
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 class BertTagger(nn.Module):
    """Per-token classifier: BERT encoder, dropout, then a linear layer.

    ``name`` is forwarded to ``BertModel.from_pretrained`` and ``n`` is the
    number of output classes per token.
    """
    def __init__(self, name, n):
        super().__init__()
        # Attribute names are kept as-is because they determine the
        # state_dict keys of saved checkpoints.
        self.bert = BertModel.from_pretrained(name)
        self.drop = nn.Dropout(0.1)
        hidden_size = self.bert.config.hidden_size
        self.cls = nn.Linear(hidden_size, n)
    def forward(self, ids, attn):
        """Return the classifier output for every position of ``ids``."""
        encoded = self.bert(ids, attention_mask=attn)
        hidden = self.drop(encoded.last_hidden_state)
        return self.cls(hidden)
 def chunk_chars(chars, max_len):
    """Split ``chars`` into consecutive slices holding at most ``max_len`` items each."""
    pieces = []
    for start in range(0, len(chars), max_len):
        pieces.append(chars[start:start + max_len])
    return pieces
 def fix_tags(tags):
    """Close words that a following ``B``/``S`` tag would leave unterminated.

    A ``B`` directly followed by ``B`` or ``S`` becomes ``S``; an ``M``
    directly followed by ``B`` or ``S`` becomes ``E``. Every other tag is
    copied unchanged (a trailing ``B``/``M`` is not touched). Returns a new list.
    """
    tags = list(tags)
    last = len(tags) - 1
    repaired = []
    for pos, tag in enumerate(tags):
        follower = tags[pos + 1] if pos < last else None
        if tag in ("B", "M") and follower in ("B", "S"):
            repaired.append("S" if tag == "B" else "E")
        else:
            repaired.append(tag)
    return repaired
 def tags_to_words(chars, tags):
    """Group characters into words according to their B/M/E/S tags.

    ``B`` starts a new word (flushing any unfinished one), ``M`` extends the
    current word, ``E`` extends and closes it, and any other tag emits the
    character as a word of its own after flushing. A word still open at the
    end of the input is emitted as well.
    """
    words = []
    pending = ""
    for ch, tag in zip(chars, tags):
        if tag == "M" or tag == "E":
            pending += ch
            if tag == "E":
                words.append(pending)
                pending = ""
            continue
        # "B" and every remaining tag first flush whatever is still open.
        if pending:
            words.append(pending)
        if tag == "B":
            pending = ch
        else:
            pending = ""
            words.append(ch)
    if pending:
        words.append(pending)
    return words
 def transfer(words):
    """Render words as space-separated groups of running character indices.

    Each word becomes ``[i,j,...]`` listing the positions of its characters,
    counted across all words, e.g. ``['ab', 'c'] -> '[0,1] [2]'``.
    """
    groups = []
    offset = 0
    for word in words:
        stop = offset + len(word)
        groups.append("[" + ",".join(map(str, range(offset, stop))) + "]")
        offset = stop
    return " ".join(groups)
 def main():
    """Segment every sentence of ``TEST`` with the fine-tuned tagger and write ``OUT``.

    Each sentence is split into chunks of at most ``MAX_LEN`` characters, the
    chunks are sorted by length and tagged in batches of ``BATCH``, and the
    per-chunk tags are stitched back together in sentence order before being
    converted to index groups.
    """
    tok = BertTokenizerFast.from_pretrained(TOK_DIR)
    model = BertTagger(MODEL_NAME, len(LABELS)).to(device)
    model.load_state_dict(torch.load(CKPT, map_location=device))
    model.eval()
    with open(TEST, encoding="utf-8") as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header row (tolerates an empty file)
        # Keep only the first two columns so a row with extra fields does not
        # break the (id, sentence) unpacking below.
        rows = [(r[0], r[1]) for r in reader if len(r) >= 2]
    jobs = []
    for i, (_, sent) in enumerate(rows):
        for s, sub in enumerate(chunk_chars(list(sent), MAX_LEN)):
            jobs.append((i, s, sub))
    # Batch chunks of similar length together to limit padding.
    jobs.sort(key=lambda x: len(x[2]))
    all_tags = {}
    # Only request autocast when the model actually runs on a CUDA device.
    use_amp = device.type == "cuda"
    with torch.no_grad():
        for b in range(0, len(jobs), BATCH):
            batch = jobs[b:b + BATCH]
            chars_list = [j[2] for j in batch]
            enc = tok(chars_list, is_split_into_words=True, truncation=True,
                      max_length=MAX_LEN + 2, padding=True, return_tensors="pt")
            ids = enc["input_ids"].to(device)
            attn = enc["attention_mask"].to(device)
            with torch.amp.autocast("cuda", dtype=torch.float16, enabled=use_amp):
                logits = model(ids, attn)
            preds = logits.argmax(-1).cpu().numpy()
            for k, (i, s, chars) in enumerate(batch):
                # Place each prediction at its character index. A character
                # for which the tokenizer emits no token keeps the default
                # "S" instead of shifting the tags of everything after it.
                tags = ["S"] * len(chars)
                seen = set()
                for j, wid in enumerate(enc.word_ids(batch_index=k)):
                    if wid is None or wid in seen or wid >= len(chars):
                        continue
                    seen.add(wid)  # only the first sub-token of a character counts
                    tags[wid] = LABELS[preds[k][j]]
                all_tags[(i, s)] = tags
            if (b // BATCH) % 20 == 0:
                print(f"  {b}/{len(jobs)}")
    with open(OUT, "w", encoding="utf-8", newline="") as fo:
        w = csv.writer(fo)
        # Lower-case "id" matches sample_submission.csv and predict.py.
        w.writerow(["id", "expected"])
        for i, (sid, sent) in enumerate(rows):
            chars = list(sent)
            full_tags = []
            s = 0
            while (i, s) in all_tags:
                full_tags.extend(all_tags[(i, s)])
                s += 1
            if len(full_tags) != len(chars):
                # Defensive only: every chunk now yields one tag per character.
                full_tags = (full_tags + ["S"] * len(chars))[:len(chars)]
            full_tags = fix_tags(full_tags)
            words = tags_to_words(chars, full_tags)
            w.writerow([sid, transfer(words)])
    print(f"wrote {OUT}")
 if __name__ == "__main__":
    main()
--- a/exp1_fenci/bert/train.log
+++ b/exp1_fenci/bert/train.log
@ -0,0 +1,54 @@
 train=85180  val=1738
 
Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]
Loading weights: 100%|██████████| 199/199 [00:00<00:00, 42105.96it/s]
 [1mBertModel LOAD REPORT[0m from: hfl/chinese-bert-wwm-ext
 Key                                        | Status     |  | 
 -------------------------------------------+------------+--+-
 cls.seq_relationship.bias                  | UNEXPECTED |  | 
 cls.predictions.transform.LayerNorm.bias   | UNEXPECTED |  | 
 cls.predictions.bias                       | UNEXPECTED |  | 
 cls.predictions.transform.dense.bias       | UNEXPECTED |  | 
 cls.predictions.transform.dense.weight     | UNEXPECTED |  | 
 cls.seq_relationship.weight                | UNEXPECTED |  | 
 cls.predictions.decoder.weight             | UNEXPECTED |  | 
 cls.predictions.transform.LayerNorm.weight | UNEXPECTED |  | 
 Notes:
 - UNEXPECTED:	can be ignored when loading from different task/architecture; not ok if you expect identical arch.
 Exception in thread Thread-auto_conversion:
 Traceback (most recent call last):
  File "/home/grtsinry43/.conda/envs/nlp-exp/lib/python3.11/site-packages/huggingface_hub/utils/_http.py", line 761, in hf_raise_for_status
    response.raise_for_status()
  File "/home/grtsinry43/.conda/envs/nlp-exp/lib/python3.11/site-packages/httpx/_models.py", line 829, in raise_for_status
    raise HTTPStatusError(message, request=request, response=self)
 httpx.HTTPStatusError: Client error '403 Forbidden' for url 'https://huggingface.co/api/models/hfl/chinese-bert-wwm-ext/discussions?p=0'
 For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/403
 The above exception was the direct cause of the following exception:
 Traceback (most recent call last):
  File "/home/grtsinry43/.conda/envs/nlp-exp/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/home/grtsinry43/.conda/envs/nlp-exp/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "/home/grtsinry43/.conda/envs/nlp-exp/lib/python3.11/site-packages/transformers/safetensors_conversion.py", line 117, in auto_conversion
    raise e
  File "/home/grtsinry43/.conda/envs/nlp-exp/lib/python3.11/site-packages/transformers/safetensors_conversion.py", line 96, in auto_conversion
    sha = get_conversion_pr_reference(api, pretrained_model_name_or_path, **cached_file_kwargs)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/grtsinry43/.conda/envs/nlp-exp/lib/python3.11/site-packages/transformers/safetensors_conversion.py", line 69, in get_conversion_pr_reference
    pr = previous_pr(api, model_id, pr_title, token=token)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/grtsinry43/.conda/envs/nlp-exp/lib/python3.11/site-packages/transformers/safetensors_conversion.py", line 14, in previous_pr
    for discussion in get_repo_discussions(repo_id=model_id, token=token):
  File "/home/grtsinry43/.conda/envs/nlp-exp/lib/python3.11/site-packages/huggingface_hub/hf_api.py", line 6949, in get_repo_discussions
    discussions, has_next = _fetch_discussion_page(page_index=page_index)
                            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/grtsinry43/.conda/envs/nlp-exp/lib/python3.11/site-packages/huggingface_hub/hf_api.py", line 6938, in _fetch_discussion_page
    hf_raise_for_status(resp)
  File "/home/grtsinry43/.conda/envs/nlp-exp/lib/python3.11/site-packages/huggingface_hub/utils/_http.py", line 849, in hf_raise_for_status
    raise _format(HfHubHTTPError, message, response) from e
 huggingface_hub.errors.HfHubHTTPError: (Request ID: Root=1-69e66348-038e57a352b76902655292cb;7091e990-eb43-487d-900e-ca182e86423e)
 403 Forbidden: Discussions are disabled for this repo.
 Cannot access content at: https://huggingface.co/api/models/hfl/chinese-bert-wwm-ext/discussions?p=0.
 Make sure your token has the correct permissions.
--- a/exp1_fenci/bert/train_bert.py
+++ b/exp1_fenci/bert/train_bert.py
@ -0,0 +1,177 @@
 """BERT 字符级 BMES 分词训练。"""
 import csv, os, random, json, time
 import torch
 import torch.nn as nn
 from torch.utils.data import Dataset, DataLoader
 from transformers import BertTokenizerFast, BertModel, get_linear_schedule_with_warmup
 from torch.optim import AdamW
 # Training CSV; load_pairs() skips its first row and reads column 0 of each row.
 TRAIN = "../train.csv"
 # Encoder / tokenizer name passed to the from_pretrained calls.
 MODEL_NAME = "hfl/chinese-bert-wwm-ext"
 # Directory that receives best.pt, the tokenizer files and labels.json.
 SAVE_DIR = "ckpt"
 # Tokenizer max_length; sequences are cut to MAX_LEN - 2 characters beforehand
 # (presumably leaving room for [CLS]/[SEP] - TODO confirm).
 MAX_LEN = 128
 # Training batch size (validation uses BATCH * 2).
 BATCH = 32
 EPOCHS = 3
 # Learning rate given to AdamW.
 LR = 3e-5
 SEED = 42
 # Fraction of the shuffled pairs held out for validation.
 VAL_RATIO = 0.02
 # Tag vocabulary and its tag -> class-index lookup.
 LABELS = ["B", "M", "E", "S"]
 L2I = {l: i for i, l in enumerate(LABELS)}
 # Label for positions without a word id; also the loss's ignore_index.
 PAD_ID = -100
 # Seed Python's and torch's random number generators.
 random.seed(SEED); torch.manual_seed(SEED)
 # Use the GPU when one is available, otherwise fall back to the CPU.
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 def tag_word(w):
    """Return the BMES tag sequence for one word, one tag per character.

    A single character is tagged ``S``; longer words are ``B``, zero or more
    ``M``, then ``E``. An empty word yields an empty list so that the tags
    always line up with the characters.
    """
    size = len(w)
    if size == 0:
        return []
    if size == 1:
        return ["S"]
    return ["B"] + ["M"] * (size - 2) + ["E"]
 def load_pairs():
    """Read ``TRAIN`` and return a list of ``(chars, tags)`` pairs.

    The first CSV row is skipped. Column 0 of every other row is treated as
    a space-separated segmentation; rows that are empty or contain no words
    are ignored.
    """
    dataset = []
    with open(TRAIN, encoding="utf-8") as handle:
        reader = csv.reader(handle)
        next(reader)
        for record in reader:
            tokens = [t for t in record[0].strip().split(" ") if t] if record else []
            if not tokens:
                continue
            char_seq = [ch for token in tokens for ch in token]
            tag_seq = [tag for token in tokens for tag in tag_word(token)]
            dataset.append((char_seq, tag_seq))
    return dataset
 class SegDS(Dataset):
    """Dataset that turns ``(chars, tags)`` pairs into fixed-length tensors."""
    def __init__(self, pairs, tok):
        self.pairs = pairs
        self.tok = tok
    def __len__(self):
        return len(self.pairs)
    def __getitem__(self, i):
        """Return ``(input_ids, attention_mask, labels)`` for sample ``i``.

        Positions whose word id is ``None`` are labelled ``PAD_ID``.
        """
        limit = MAX_LEN - 2
        chars, tags = (seq[:limit] for seq in self.pairs[i])
        enc = self.tok(chars, is_split_into_words=True, truncation=True, max_length=MAX_LEN,
                       padding="max_length", return_tensors="pt")
        label_ids = [PAD_ID if wid is None else L2I[tags[wid]]
                     for wid in enc.word_ids(batch_index=0)]
        return (enc["input_ids"].squeeze(0),
                enc["attention_mask"].squeeze(0),
                torch.tensor(label_ids, dtype=torch.long))
 class BertTagger(nn.Module):
    """Per-token classifier: BERT encoder, dropout, then a linear layer.

    ``name`` is forwarded to ``BertModel.from_pretrained`` and ``n_labels``
    is the number of output classes per token.
    """
    def __init__(self, name, n_labels):
        super().__init__()
        # Attribute names are kept as-is because they determine the
        # state_dict keys of the saved checkpoint.
        self.bert = BertModel.from_pretrained(name)
        self.drop = nn.Dropout(0.1)
        width = self.bert.config.hidden_size
        self.cls = nn.Linear(width, n_labels)
    def forward(self, ids, attn):
        """Return the classifier output for every position of ``ids``."""
        hidden = self.drop(self.bert(ids, attention_mask=attn).last_hidden_state)
        return self.cls(hidden)
 def seg_f1(gold_tags, pred_tags):
    """Return the word-level F1 between two BMES tag sequences.

    Both sequences are converted to sets of inclusive ``(start, end)`` spans;
    a span counts as correct only when it appears in both sets. Returns 0.0
    when no span matches.
    """
    def spans_of(seq):
        found = set()
        open_at = None
        for pos, tag in enumerate(seq):
            # "B" and "S" both terminate a word that is still open.
            if tag in ("B", "S") and open_at is not None:
                found.add((open_at, pos - 1))
                open_at = None
            if tag == "B":
                open_at = pos
            elif tag == "S":
                found.add((pos, pos))
            elif tag in ("M", "E"):
                # An "M"/"E" with nothing open starts a word at this position.
                if open_at is None:
                    open_at = pos
                if tag == "E":
                    found.add((open_at, pos))
                    open_at = None
        if open_at is not None:
            found.add((open_at, len(seq) - 1))
        return found
    gold_spans = spans_of(gold_tags)
    pred_spans = spans_of(pred_tags)
    hits = len(gold_spans & pred_spans)
    if hits == 0:
        return 0.0
    precision = hits / len(pred_spans)
    recall = hits / len(gold_spans)
    return 2 * precision * recall / (precision + recall)
 def run_val(model, loader):
    """Return the mean per-sample ``seg_f1`` of ``model`` over ``loader``.

    The model is switched to eval mode. Positions labelled ``PAD_ID`` are
    excluded, and samples left with no gold tags are skipped. Returns 0 when
    nothing was scored.
    """
    model.eval()
    scores = []
    with torch.no_grad():
        for ids, attn, labels in loader:
            logits = model(ids.to(device), attn.to(device))
            pred_rows = logits.argmax(-1).cpu().numpy()
            gold_rows = labels.numpy()
            for pred_row, gold_row in zip(pred_rows, gold_rows):
                kept = [(g, p) for g, p in zip(gold_row, pred_row) if g != PAD_ID]
                if not kept:
                    continue
                gold = [LABELS[g] for g, _ in kept]
                guess = [LABELS[p] for _, p in kept]
                scores.append(seg_f1(gold, guess))
    return sum(scores) / max(len(scores), 1)
 def main():
    """Fine-tune the BMES tagger and keep the checkpoint with the best validation F1.

    Loads and shuffles the training pairs, holds out the first ``VAL_RATIO``
    share for validation, trains for ``EPOCHS`` epochs, and after each epoch
    saves weights, tokenizer and label list to ``SAVE_DIR`` whenever the
    validation F1 improves.
    """
    os.makedirs(SAVE_DIR, exist_ok=True)
    tok = BertTokenizerFast.from_pretrained(MODEL_NAME)
    pairs = load_pairs()
    random.shuffle(pairs)
    # The first n_val shuffled pairs form the validation set.
    n_val = int(len(pairs) * VAL_RATIO)
    val, train = pairs[:n_val], pairs[n_val:]
    print(f"train={len(train)}  val={len(val)}")
    tr_loader = DataLoader(SegDS(train, tok), batch_size=BATCH, shuffle=True, num_workers=2, pin_memory=True)
    va_loader = DataLoader(SegDS(val, tok), batch_size=BATCH * 2, num_workers=2, pin_memory=True)
    model = BertTagger(MODEL_NAME, len(LABELS)).to(device)
    opt = AdamW(model.parameters(), lr=LR, weight_decay=0.01)
    # One scheduler step per batch; the first 10% of all steps are warmup.
    total_steps = len(tr_loader) * EPOCHS
    sched = get_linear_schedule_with_warmup(opt, int(0.1 * total_steps), total_steps)
    # Positions labelled PAD_ID are passed as ignore_index to the loss.
    loss_fn = nn.CrossEntropyLoss(ignore_index=PAD_ID)
    scaler = torch.amp.GradScaler("cuda")
    best = 0.0
    step = 0
    for epoch in range(EPOCHS):
        model.train()
        t0 = time.time()
        running = 0.0
        for ids, attn, labels in tr_loader:
            ids, attn, labels = ids.to(device), attn.to(device), labels.to(device)
            opt.zero_grad()
            with torch.amp.autocast("cuda", dtype=torch.float16):
                logits = model(ids, attn)
                # Flatten to (positions, classes) vs (positions,) for the loss.
                loss = loss_fn(logits.view(-1, len(LABELS)), labels.view(-1))
            scaler.scale(loss).backward()
            # Unscale before clipping so the norm limit is applied to the
            # unscaled gradients.
            scaler.unscale_(opt)
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            scaler.step(opt); scaler.update(); sched.step()
            running += loss.item(); step += 1
            if step % 200 == 0:
                # Report the mean loss since the previous report, then reset.
                print(f"  ep{epoch+1} step{step} loss={running/200:.4f} lr={sched.get_last_lr()[0]:.2e}")
                running = 0.0
        f1 = run_val(model, va_loader)
        dt = time.time() - t0
        print(f"[epoch {epoch+1}] val F1={f1:.4f}  time={dt:.1f}s")
        if f1 > best:
            # New best epoch: persist weights, tokenizer files and label list.
            best = f1
            torch.save(model.state_dict(), f"{SAVE_DIR}/best.pt")
            tok.save_pretrained(SAVE_DIR)
            with open(f"{SAVE_DIR}/labels.json", "w") as f:
                json.dump(LABELS, f)
            print(f"  saved best -> {SAVE_DIR}/best.pt")
    print(f"best val F1 = {best:.4f}")
 if __name__ == "__main__":
    main()
--- a/exp1_fenci/chinese_word_freq_list.txt
+++ b/exp1_fenci/chinese_word_freq_list.txt
--- a/exp1_fenci/csu-ai-in-class-nlp-2026.zip
+++ b/exp1_fenci/csu-ai-in-class-nlp-2026.zip
--- a/exp1_fenci/hmm.json
+++ b/exp1_fenci/hmm.json
--- a/exp1_fenci/predict.py
+++ b/exp1_fenci/predict.py
@ -0,0 +1,162 @@
 """用 HMM + 最大匹配词典混合的方式分词。
 策略：先用高频词典最大匹配，对未匹配部分用 HMM Viterbi 分。
 """
 import csv, json, sys
 from collections import defaultdict
 # JSON file with the HMM parameters read by load_model().
 MODEL = "hmm.json"
 # Input CSV; main() skips its first row and uses columns 0 and 1 of each row.
 TEST = "test.csv"
 # Output CSV written by main().
 OUT = "submission.csv"
 # Word list read by load_dict(); the second whitespace-separated field of a line is the word.
 FREQ = "chinese_word_freq_list.txt"
 # Tag set iterated by viterbi().
 STATES = ("B", "M", "E", "S")
 def load_model():
    """Return the parsed contents of the ``MODEL`` JSON file."""
    with open(MODEL, encoding="utf-8") as handle:
        payload = handle.read()
    return json.loads(payload)
 def load_dict():
    """Return the set of dictionary words read from ``FREQ``.

    The word is the second whitespace-separated field of each line; lines
    with fewer than two fields and words shorter than two characters are
    skipped.
    """
    with open(FREQ, encoding="utf-8") as handle:
        fields_per_line = (line.strip().split() for line in handle)
        return {
            fields[1]
            for fields in fields_per_line
            if len(fields) >= 2 and len(fields[1]) >= 2
        }
 def viterbi(chars, model):
    """Return the highest-scoring tag sequence for ``chars`` under the HMM.

    ``model`` provides the ``init``, ``trans``, ``emit`` and ``emit_default``
    tables; their values are added together as scores. A character missing
    from a state's emission table uses that state's ``emit_default``.

    No transition is forbidden explicitly; every transition is scored with
    whatever ``trans`` contains. The only hard constraint applied here is
    that the final tag must be ``E`` or ``S``.

    Back-pointers are stored instead of copying the whole path for every
    state at every step, so the work grows linearly with ``len(chars)``.
    """
    if not chars:
        return []
    init = model["init"]
    trans = model["trans"]
    emit = model["emit"]
    emit_def = model["emit_default"]
    scores = {s: init[s] + emit[s].get(chars[0], emit_def[s]) for s in STATES}
    backptrs = []
    for t in range(1, len(chars)):
        step_scores = {}
        step_prev = {}
        for s in STATES:
            e = emit[s].get(chars[t], emit_def[s])
            # Comparing (score, state) tuples breaks ties by state name.
            step_scores[s], step_prev[s] = max(
                (scores[p] + trans[p][s] + e, p) for p in STATES
            )
        backptrs.append(step_prev)
        scores = step_scores
    # The last tag must be E or S.
    _, state = max((scores[s], s) for s in ("E", "S"))
    tags = [state]
    for step_prev in reversed(backptrs):
        state = step_prev[state]
        tags.append(state)
    tags.reverse()
    return tags
 def seg_by_tags(chars, tags):
    """Group characters into words according to their B/M/E/S tags.

    ``B`` starts a new word (flushing any unfinished one), ``M`` extends the
    current word, ``E`` extends and closes it, and any other tag (``S``)
    emits the character on its own after flushing. A word still open at the
    end of the input is emitted too.
    """
    result = []
    current = ""
    for ch, tag in zip(chars, tags):
        if tag == "M":
            current += ch
            continue
        if tag == "E":
            result.append(current + ch)
            current = ""
            continue
        # "B" or "S": whatever is still open ends here.
        if current:
            result.append(current)
        if tag == "B":
            current = ch
        else:
            current = ""
            result.append(ch)
    if current:
        result.append(current)
    return result
 def max_match(chars, word_set, max_len=6):
    """Forward maximum matching against ``word_set``.

    Returns the matched segments as half-open ``(start, end)`` index pairs.
    At each position the longest candidate (at most ``max_len`` characters,
    at least two) found in ``word_set`` wins; when nothing matches, the scan
    moves on by one character without recording a segment.
    """
    total = len(chars)
    spans = []
    pos = 0
    while pos < total:
        longest = min(max_len, total - pos)
        hit = next(
            (size for size in range(longest, 1, -1)
             if "".join(chars[pos:pos + size]) in word_set),
            None,
        )
        if hit is None:
            pos += 1
        else:
            spans.append((pos, pos + hit))
            pos += hit
    return spans
 def hybrid_segment(sentence, model, word_set):
    """Segment ``sentence`` with dictionary matches first and the HMM for the rest.

    Spans found by ``max_match`` are emitted as words unchanged; every gap
    between them (and the tail after the last one) is tagged with
    ``viterbi`` and split by ``seg_by_tags``. Returns the list of words.
    """
    chars = list(sentence)
    n = len(chars)
    if n == 0:
        return []
    words = []
    cursor = 0
    # The empty sentinel span (n, n) makes the loop also handle the tail.
    for start, stop in max_match(chars, word_set) + [(n, n)]:
        if cursor < start:
            gap = chars[cursor:start]
            words.extend(seg_by_tags(gap, viterbi(gap, model)))
        if start < stop:
            words.append("".join(chars[start:stop]))
        cursor = stop
    return words
 def transfer(words):
    """Render words as index groups: ['我','爱','自然','语言'] -> '[0] [1] [2,3] [4,5]'."""
    rendered = []
    start = 0
    for word in words:
        positions = [str(start + k) for k in range(len(word))]
        rendered.append("[" + ",".join(positions) + "]")
        start += len(word)
    return " ".join(rendered)
 def main():
    """Segment every sentence of ``TEST`` with the hybrid segmenter and write ``OUT``.

    The first row of ``TEST`` is skipped, rows with fewer than two columns
    are ignored, and each remaining row produces one ``id, expected`` line.
    """
    model = load_model()
    word_set = load_dict()
    print(f"dict size={len(word_set)}")
    with open(TEST, encoding="utf-8") as src:
        with open(OUT, "w", encoding="utf-8", newline="") as dst:
            reader = csv.reader(src)
            writer = csv.writer(dst)
            next(reader)  # header row: id,sentence
            writer.writerow(["id", "expected"])
            for row in reader:
                if len(row) >= 2:
                    sid, sent = row[:2]
                    segmented = hybrid_segment(sent, model, word_set)
                    writer.writerow([sid, transfer(segmented)])
    print(f"wrote {OUT}")
 if __name__ == "__main__":
    main()
--- a/exp1_fenci/sample_submission.csv
+++ b/exp1_fenci/sample_submission.csv
@ -0,0 +1,11 @@
 id,expected
 1,"[0,1] [2,3] [4] [5] [6,7] [8,9] [10] [11,12]"
 2,"[0,1] [2] [3,4] [5,6] [7] [8,9] [10]"
 3,"[0,1,2] [3,4] [5,6] [7] [8] [9] [10] [11,12] [13] [14] [15,16] [17,18] [19] [20,21,22] [23]"
 4,"[0,1] [2,3] [4] [5,6] [7] [8] [9,10] [11,12] [13] [14,15] [16,17,18,19] [20] [21,22,23] [24,25] [26,27] [28] [29] [30]"
 5,"[0] [1] [2] [3] [4] [5] [6,7] [8,9] [10,11] [12] [13] [14] [15,16] [17] [18,19] [20,21] [22,23] [24] [25,26] [27,28] [29]"
 6,"[0,1] [2] [3] [4] [5,6] [7,8] [9] [10,11] [12,13] [14,15] [16,17] [18] [19,20] [21] [22,23] [24]"
 7,"[0,1,2,3] [4] [5,6] [7] [8] [9,10,11] [12] [13] [14,15] [16] [17,18] [19] [20,21] [22] [23] [24,25] [26] [27] [28,29] [30,31] [32,33] [34]"
 8,"[0] [1,2] [3] [4] [5,6] [7,8] [9,10] [11] [12,13] [14]"
 9,"[0] [1] [2] [3,4] [5] [6,7] [8]"
 10,"[0] [1] [2] [3,4] [5] [6,7] [8,9] [10] [11,12] [13] [14] [15,16] [17]"
--- a/exp1_fenci/submission.csv
+++ b/exp1_fenci/submission.csv
--- a/exp1_fenci/submission_bert.csv
+++ b/exp1_fenci/submission_bert.csv
--- a/exp1_fenci/test.csv
+++ b/exp1_fenci/test.csv
--- a/exp1_fenci/train.csv
+++ b/exp1_fenci/train.csv
--- a/exp1_fenci/train_hmm.py
+++ b/exp1_fenci/train_hmm.py
@ -0,0 +1,72 @@
 """HMM 分词器：BMES 标签 + Laplace 平滑，模型存为 json。"""
 import csv, json, math
 from collections import defaultdict
 # Training CSV; main() skips its first row and reads column 0 of each row.
 TRAIN = "train.csv"
 # Output JSON file that receives the trained log-probability tables.
 MODEL = "hmm.json"
 # Tag set used for the initial and transition tables.
 STATES = ("B", "M", "E", "S")
 def tag_word(w):
    """Return the BMES tag sequence for one word, one tag per character.

    A single character is tagged ``S``; longer words are ``B``, zero or more
    ``M``, then ``E``. An empty word yields an empty list so that the tags
    always line up with the characters.
    """
    size = len(w)
    if size == 0:
        return []
    if size == 1:
        return ["S"]
    return ["B"] + ["M"] * (size - 2) + ["E"]
 def main():
    """Estimate the HMM tables from ``TRAIN`` and save them to ``MODEL`` as JSON.

    Counts initial tags, tag-to-tag transitions and tag-to-character
    emissions, converts them to add-one-smoothed log probabilities, and
    writes ``init``, ``trans``, ``emit`` and ``emit_default`` to the file.
    """
    init = defaultdict(float)
    trans = {s: defaultdict(float) for s in STATES}
    emit = {s: defaultdict(float) for s in STATES}
    total_lines = 0
    with open(TRAIN, encoding="utf-8") as handle:
        reader = csv.reader(handle)
        next(reader)
        for row in reader:
            words = [w for w in row[0].strip().split(" ") if w] if row else []
            if not words:
                continue
            chars = [ch for w in words for ch in w]
            tags = [tag for w in words for tag in tag_word(w)]
            init[tags[0]] += 1
            for ch, tag in zip(chars, tags):
                emit[tag][ch] += 1
            for prev_tag, tag in zip(tags, tags[1:]):
                trans[prev_tag][tag] += 1
            total_lines += 1
    n_states = len(STATES)
    init_total = sum(init.values())
    init_log = {
        s: math.log((init.get(s, 0.0) + 1) / (init_total + n_states)) for s in STATES
    }
    trans_log = {}
    for s in STATES:
        row_total = sum(trans[s].values())
        trans_log[s] = {
            t: math.log((trans[s].get(t, 0.0) + 1) / (row_total + n_states))
            for t in STATES
        }
    vocab = set()
    for s in STATES:
        vocab |= set(emit[s])
    # One extra slot is reserved for characters never seen in training.
    V = len(vocab) + 1
    emit_log = {}
    emit_default = {}
    for s in STATES:
        denom = sum(emit[s].values()) + V
        emit_log[s] = {ch: math.log((cnt + 1) / denom) for ch, cnt in emit[s].items()}
        emit_default[s] = math.log(1 / denom)
    tables = {"init": init_log, "trans": trans_log, "emit": emit_log, "emit_default": emit_default}
    with open(MODEL, "w", encoding="utf-8") as out:
        json.dump(tables, out, ensure_ascii=False)
    print(f"trained on {total_lines} sentences, vocab={len(vocab)}, saved {MODEL}")
 if __name__ == "__main__":
    main()
--- a/exp1_fenci/transfer.py
+++ b/exp1_fenci/transfer.py
@ -0,0 +1,20 @@
 """
    切分结果转换脚本。
    param:
        raw_sen: 切分结果，由空格隔开的字符串。“我 爱 自然 语言 处理”
    return：
        转换为序列的字符串。"[0] [1] [2,3] [4,5] [6,7]"
 """
 def transfer(raw_sen):
    """Convert a space-separated segmentation into character-index groups.

    Args:
        raw_sen: segmentation result with words separated by single spaces,
            e.g. ``"我 爱 自然 语言 处理"``.

    Returns:
        One bracketed group of running character indices per word, joined by
        spaces, e.g. ``"[0] [1] [2,3] [4,5] [6,7]"``. Empty pieces produced by
        repeated spaces (or by an empty input) are skipped instead of being
        emitted as ``[]``.
    """
    count = 0
    groups = []
    for ele in raw_sen.strip().split(' '):
        if not ele:
            # split(' ') yields '' between consecutive spaces and for ''.
            continue
        stop = count + len(ele)
        groups.append('[' + ','.join(map(str, range(count, stop))) + ']')
        count = stop
    return ' '.join(groups)
--- a/exp2_people/analyze.py
+++ b/exp2_people/analyze.py
@ -0,0 +1,87 @@
 """对爬到的文章分词统计 + 词云。"""
 import pandas as pd
 import jieba
 import jieba.posseg as pseg
 from collections import Counter
 from wordcloud import WordCloud
 import matplotlib.pyplot as plt
 from matplotlib import font_manager
 import os
 plt.rcParams["axes.unicode_minus"] = False
 INPUT = "articles.xlsx"
 # Preferred CJK font and its fallback; used for matplotlib text and word clouds.
 FONT = "/usr/share/fonts/adobe-source-han-sans/SourceHanSansCN-Normal.otf"
 FONT_FALLBACK = "/usr/share/fonts/noto-cjk/NotoSansCJK-Regular.ttc"
 # Register the first font file that exists so matplotlib can render Chinese titles.
 for _fp in (FONT, FONT_FALLBACK):
    if os.path.exists(_fp):
        font_manager.fontManager.addfont(_fp)
        plt.rcParams["font.sans-serif"] = [font_manager.FontProperties(fname=_fp).get_name()]
        break
 # WordCloud gets the preferred font when present, otherwise the fallback path.
 if not os.path.exists(FONT):
    FONT = FONT_FALLBACK
 # Stop words. set() over a string only yields single characters, which can
 # never match because main() already drops words shorter than two characters,
 # so the multi-character stop words are added explicitly.
 STOP = set("的了和是在也及与或对从到把被让使由于这那我们他们") | {"由于", "我们", "他们"}
 # Custom dictionary entries (the assignment asks for new words to be added).
 for w in ["习近平", "新时代", "中国式现代化", "二十大", "党中央", "共同富裕",
           "高质量发展", "一带一路", "人类命运共同体", "火神山"]:
    jieba.add_word(w, freq=1000)
 def main():
    """Count words in the crawled articles and render the word clouds.

    Reads the ``content`` column of ``INPUT``, keeps tokens of at least two
    characters that are not in ``STOP`` and contain a character in the range
    U+4E00..U+9FFF, prints and saves the top-20 nouns and verbs, and writes
    three word-cloud images plus a combined figure.
    """
    frame = pd.read_excel(INPUT)
    all_text = "\n".join(frame["content"].fillna("").astype(str).tolist())
    print(f"total chars={len(all_text)}")
    all_words = Counter()
    nouns = Counter()
    verbs = Counter()
    for token, flag in pseg.cut(all_text):
        token = token.strip()
        if len(token) < 2 or token in STOP:
            continue
        if all(not ("\u4e00" <= ch <= "\u9fff") for ch in token):
            continue
        all_words[token] += 1
        # Part-of-speech flags starting with "n" count as nouns, "v" as verbs.
        bucket = nouns if flag.startswith("n") else verbs if flag.startswith("v") else None
        if bucket is not None:
            bucket[token] += 1
    top_nouns = nouns.most_common(20)
    top_verbs = verbs.most_common(20)
    for heading, ranking in (("\n=== Top20 名词 ===", top_nouns),
                             ("\n=== Top20 动词 ===", top_verbs)):
        print(heading)
        for word, freq in ranking:
            print(f"  {word}\t{freq}")
    for ranking, target in ((top_nouns, "top20_nouns.csv"), (top_verbs, "top20_verbs.csv")):
        pd.DataFrame(ranking, columns=["word", "count"]).to_csv(target, index=False)
    # Word cloud over every counted word.
    overall = WordCloud(font_path=FONT, width=1200, height=800,
                        background_color="white", max_words=200)
    overall.generate_from_frequencies(all_words)
    overall.to_file("wordcloud_all.png")
    # Separate clouds for nouns and verbs.
    for freqs, palette, target in ((nouns, "Blues", "wordcloud_nouns.png"),
                                   (verbs, "Reds", "wordcloud_verbs.png")):
        cloud = WordCloud(font_path=FONT, width=1000, height=700, background_color="white",
                          colormap=palette, max_words=100)
        cloud.generate_from_frequencies(freqs).to_file(target)
    # Combined figure with the three images side by side.
    panels = [("wordcloud_all.png", "整体"),
              ("wordcloud_nouns.png", "名词"),
              ("wordcloud_verbs.png", "动词")]
    _, axes = plt.subplots(1, 3, figsize=(21, 7))
    for ax, (path, title) in zip(axes, panels):
        ax.imshow(plt.imread(path))
        ax.set_title(title, fontsize=20)
        ax.axis("off")
    plt.tight_layout()
    plt.savefig("wordclouds_combined.png", dpi=120)
    print("\nsaved wordcloud_all.png / wordcloud_nouns.png / wordcloud_verbs.png / wordclouds_combined.png")
 if __name__ == "__main__":
    main()
--- a/exp2_people/articles.xlsx
+++ b/exp2_people/articles.xlsx
--- a/exp2_people/crawl.py
+++ b/exp2_people/crawl.py
@ -0,0 +1,66 @@
 """爬取人民网习主席讲话数据库首页链接 + 每篇标题正文，存到 excel。"""
 import requests, time, os, re
 from bs4 import BeautifulSoup
 import pandas as pd
 # Site root; article links found on the list page are appended to it.
 BASE = "http://jhsjk.people.cn/"
 # List page that extract_list() scans for article links.
 LIST_URL = BASE + "result/1?form=706&else=501"
 # Request headers sent by get().
 HEADERS = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36"}
 # Excel file written by main().
 OUT_XLSX = "articles.xlsx"
 def get(url):
    """Fetch ``url`` and return the response body as text.

    The response encoding is set from ``apparent_encoding`` before decoding.
    ``raise_for_status`` is called, so an HTTP error status raises.
    """
    response = requests.get(url, headers=HEADERS, timeout=20)
    response.encoding = response.apparent_encoding
    response.raise_for_status()
    body = response.text
    return body
 def extract_list():
    """Return the unique ``(url, title)`` pairs of article links on the list page.

    Only anchors whose href matches ``article/<digits>`` and whose text is
    non-empty are kept, in the order they first appear.
    """
    soup = BeautifulSoup(get(LIST_URL), "lxml")
    # Dict keys keep first-seen order and drop repeated pairs.
    unique = {}
    for anchor in soup.find_all("a", href=True):
        target = anchor["href"]
        if not re.match(r"^article/(\d+)$", target):
            continue
        label = anchor.get_text(strip=True)
        if label:
            unique.setdefault((BASE + target, label), None)
    return list(unique)
 def extract_article(url):
    """Fetch one article page and return ``(title, text)``.

    The title is the text of the first ``<h1>``; the text comes from the
    ``.d2txt_con`` element or, failing that, ``#content``. A missing element
    gives an empty string.
    """
    soup = BeautifulSoup(get(url), "lxml")
    heading = soup.find("h1")
    if heading:
        title = heading.get_text(strip=True)
    else:
        title = ""
    container = soup.select_one(".d2txt_con") or soup.select_one("#content")
    if container:
        text = container.get_text("\n", strip=True)
    else:
        text = ""
    return title, text
 def main():
    """Crawl every article linked from the list page and save them to ``OUT_XLSX``.

    An article whose download or parsing raises is reported and skipped, so
    one failure does not stop the crawl.
    """
    links = extract_list()
    total = len(links)
    print(f"found {total} links")
    records = []
    for index, (url, link_title) in enumerate(links, 1):
        try:
            page_title, body = extract_article(url)
            # Fall back to the link text when the page has no <h1> title.
            title = page_title if page_title else link_title
            print(f"[{index}/{total}] {title[:40]}  chars={len(body)}")
            records.append({"url": url, "title": title, "content": body})
            time.sleep(0.5)  # pause between successful requests
        except Exception as e:
            print(f"[{index}] failed: {e}")
    table = pd.DataFrame(records)
    table.to_excel(OUT_XLSX, index=False)
    print(f"saved {OUT_XLSX}, {len(table)} rows")
 if __name__ == "__main__":
    main()
--- a/exp2_people/top20_nouns.csv
+++ b/exp2_people/top20_nouns.csv
@ -0,0 +1,21 @@
 word,count
 金融,109
 人民,69
 干部,68
 城市,50
 领导,41
 政绩观,40
 经济,40
 中国,39
 问题,33
 群众,30
 体系,30
 讲话,29
 海洋,28
 特色,27
 文章,25
 风险,24
 政绩,23
 习近平,22
 国家,22
 主席,22
--- a/exp2_people/top20_verbs.csv
+++ b/exp2_people/top20_verbs.csv
@ -0,0 +1,21 @@
 word,count
 发展,97
 坚持,72
 建设,57
 工作,50
 推进,48
 推动,44
 创新,33
 加强,25
 服务,24
 树立,23
 践行,20
 贯彻,20
 完善,20
 实施,19
 不能,18
 学习,18
 教育,17
 深入,16
 优化,15
 考核,14
--- a/exp2_people/wordcloud_all.png
+++ b/exp2_people/wordcloud_all.png
--- a/exp2_people/wordcloud_nouns.png
+++ b/exp2_people/wordcloud_nouns.png
--- a/exp2_people/wordcloud_verbs.png
+++ b/exp2_people/wordcloud_verbs.png
--- a/exp2_people/wordclouds_combined.png
+++ b/exp2_people/wordclouds_combined.png
--- a/分词实验指导书2026.docx
+++ b/分词实验指导书2026.docx
--- a/基于MindSpore的命名实体识别实验手册.docx
+++ b/基于MindSpore的命名实体识别实验手册.docx
--- a/(1).docx
+++ b/(1).docx