Initial Commit
This commit is contained in:
commit
a4ec0b90da
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
best.pt
|
||||||
|
.codex
|
||||||
@ -0,0 +1,3 @@
|
|||||||
|
[General]
|
||||||
|
logo=
|
||||||
|
name=
|
||||||
BIN
exp1_fenci/__pycache__/predict.cpython-311.pyc
Normal file
BIN
exp1_fenci/__pycache__/predict.cpython-311.pyc
Normal file
Binary file not shown.
1
exp1_fenci/bert/ckpt/labels.json
Normal file
1
exp1_fenci/bert/ckpt/labels.json
Normal file
@ -0,0 +1 @@
|
|||||||
|
["B", "M", "E", "S"]
|
||||||
21278
exp1_fenci/bert/ckpt/tokenizer.json
Normal file
21278
exp1_fenci/bert/ckpt/tokenizer.json
Normal file
File diff suppressed because it is too large
Load Diff
14
exp1_fenci/bert/ckpt/tokenizer_config.json
Normal file
14
exp1_fenci/bert/ckpt/tokenizer_config.json
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
{
|
||||||
|
"backend": "tokenizers",
|
||||||
|
"cls_token": "[CLS]",
|
||||||
|
"do_lower_case": true,
|
||||||
|
"is_local": false,
|
||||||
|
"mask_token": "[MASK]",
|
||||||
|
"model_max_length": 1000000000000000019884624838656,
|
||||||
|
"pad_token": "[PAD]",
|
||||||
|
"sep_token": "[SEP]",
|
||||||
|
"strip_accents": null,
|
||||||
|
"tokenize_chinese_chars": true,
|
||||||
|
"tokenizer_class": "BertTokenizer",
|
||||||
|
"unk_token": "[UNK]"
|
||||||
|
}
|
||||||
129
exp1_fenci/bert/predict_bert.py
Normal file
129
exp1_fenci/bert/predict_bert.py
Normal file
@ -0,0 +1,129 @@
|
|||||||
|
"""用训练好的 BERT-BMES 做 test 分词。"""
|
||||||
|
import csv, torch
|
||||||
|
import torch.nn as nn
|
||||||
|
from transformers import BertTokenizerFast, BertModel
|
||||||
|
|
||||||
|
CKPT = "ckpt/best.pt"
|
||||||
|
TOK_DIR = "ckpt"
|
||||||
|
MODEL_NAME = "hfl/chinese-bert-wwm-ext"
|
||||||
|
TEST = "../test.csv"
|
||||||
|
OUT = "../submission_bert.csv"
|
||||||
|
MAX_LEN = 510
|
||||||
|
BATCH = 32
|
||||||
|
LABELS = ["B", "M", "E", "S"]
|
||||||
|
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||||
|
|
||||||
|
|
||||||
|
class BertTagger(nn.Module):
    """BERT encoder followed by dropout and a per-token linear classifier.

    The attribute names ``bert``, ``drop`` and ``cls`` become the parameter
    prefixes in the module's ``state_dict``, so they must stay as they are
    for saved checkpoints to load.
    """

    def __init__(self, name, n):
        """Build the tagger.

        Args:
            name: identifier passed to ``BertModel.from_pretrained``.
            n: number of output labels per token.
        """
        super().__init__()
        self.bert = BertModel.from_pretrained(name)
        self.drop = nn.Dropout(0.1)
        hidden_size = self.bert.config.hidden_size
        self.cls = nn.Linear(hidden_size, n)

    def forward(self, ids, attn):
        """Return the classifier logits for every input position."""
        encoded = self.bert(ids, attention_mask=attn)
        hidden_states = encoded.last_hidden_state
        return self.cls(self.drop(hidden_states))
|
||||||
|
|
||||||
|
|
||||||
|
def chunk_chars(chars, max_len):
    """Split ``chars`` into consecutive slices of at most ``max_len`` items.

    The slices are returned in order; an empty input gives an empty list.
    """
    pieces = []
    for start in range(0, len(chars), max_len):
        stop = start + max_len
        pieces.append(chars[start:stop])
    return pieces
|
||||||
|
|
||||||
|
|
||||||
|
def fix_tags(tags):
    """Close words that are still open when the next word starts.

    Whenever a "B" or "M" tag is directly followed by "B" or "S", the
    earlier tag is rewritten: "B" becomes "S" and "M" becomes "E". Every
    other tag is copied unchanged. A new list is returned.
    """
    source = list(tags)
    fixed = list(source)
    for pos in range(1, len(source)):
        before = source[pos - 1]
        current = source[pos]
        if current in ("B", "S") and before in ("B", "M"):
            fixed[pos - 1] = "S" if before == "B" else "E"
    return fixed
|
||||||
|
|
||||||
|
|
||||||
|
def tags_to_words(chars, tags):
    """Group characters into words according to their BMES tags.

    "B" starts a new word (emitting any unfinished one first), "M" extends
    the current word, "E" extends and closes it, and any other tag emits the
    character as its own word after flushing the unfinished one. A word still
    open at the end of the input is emitted too.
    """
    words = []
    pending = []  # characters of the word currently being assembled

    def flush():
        word = "".join(pending)
        if word:
            words.append(word)
        pending.clear()

    for ch, tag in zip(chars, tags):
        if tag == "B":
            flush()
            pending.append(ch)
        elif tag == "M":
            pending.append(ch)
        elif tag == "E":
            pending.append(ch)
            words.append("".join(pending))
            pending.clear()
        else:
            flush()
            words.append(ch)
    flush()
    return words
|
||||||
|
|
||||||
|
|
||||||
|
def transfer(words):
    """Encode a word list as space-separated groups of character indices.

    Each word becomes a bracketed, comma-separated list of the consecutive
    character positions it covers, e.g. ``["ab", "c"]`` -> ``"[0,1] [2]"``.
    """
    groups = []
    offset = 0
    for word in words:
        end = offset + len(word)
        indices = ",".join(str(k) for k in range(offset, end))
        groups.append("[" + indices + "]")
        offset = end
    return " ".join(groups)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Segment every sentence in TEST with the trained tagger and write OUT.

    Steps: load the tokenizer and checkpoint, split each sentence into chunks
    of at most MAX_LEN characters, tag the chunks in length-sorted batches,
    stitch the chunk tags back together per sentence and write one
    ``id,expected`` row per input row.
    """
    tok = BertTokenizerFast.from_pretrained(TOK_DIR)
    model = BertTagger(MODEL_NAME, len(LABELS)).to(device)
    model.load_state_dict(torch.load(CKPT, map_location=device))
    model.eval()

    with open(TEST, encoding="utf-8", newline="") as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header row (tolerate an empty file)
        # Keep only the first two columns so a row with extra fields cannot
        # break the (id, sentence) unpacking below.
        rows = [(r[0], r[1]) for r in reader if len(r) >= 2]

    jobs = []
    all_tags = {}
    for i, (_, sent) in enumerate(rows):
        for s, sub in enumerate(chunk_chars(list(sent), MAX_LEN)):
            jobs.append((i, s, sub))

    # Batch chunks of similar length together to limit padding.
    jobs.sort(key=lambda job: len(job[2]))

    # Mixed precision only makes sense when the model actually runs on CUDA.
    use_amp = device.type == "cuda"

    with torch.no_grad():
        for b in range(0, len(jobs), BATCH):
            batch = jobs[b:b + BATCH]
            chars_list = [j[2] for j in batch]
            enc = tok(chars_list, is_split_into_words=True, truncation=True,
                      max_length=MAX_LEN + 2, padding=True, return_tensors="pt")
            ids = enc["input_ids"].to(device)
            attn = enc["attention_mask"].to(device)
            with torch.amp.autocast("cuda", dtype=torch.float16, enabled=use_amp):
                logits = model(ids, attn)
            preds = logits.argmax(-1).cpu().numpy()
            for k, (i, s, chars) in enumerate(batch):
                # Pre-fill with "S" and write each prediction at its word
                # index. A character that produces no token (or is cut off
                # by truncation) keeps "S" instead of shifting the tags of
                # every character after it.
                tags = ["S"] * len(chars)
                seen = set()
                for j, wid in enumerate(enc.word_ids(batch_index=k)):
                    if wid is None or wid in seen or wid >= len(chars):
                        continue
                    seen.add(wid)  # only the first sub-token of a character counts
                    tags[wid] = LABELS[preds[k][j]]
                all_tags[(i, s)] = tags
            if (b // BATCH) % 20 == 0:
                print(f" {b}/{len(jobs)}")

    with open(OUT, "w", encoding="utf-8", newline="") as fo:
        w = csv.writer(fo)
        # Header spelled as in sample_submission.csv ("id,expected").
        w.writerow(["id", "expected"])
        for i, (sid, sent) in enumerate(rows):
            chars = list(sent)
            full_tags = []
            s = 0
            # Chunks were numbered 0, 1, 2, ... per sentence; concatenate in order.
            while (i, s) in all_tags:
                full_tags.extend(all_tags[(i, s)])
                s += 1
            full_tags = fix_tags(full_tags)
            words = tags_to_words(chars, full_tags)
            w.writerow([sid, transfer(words)])
    print(f"wrote {OUT}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
54
exp1_fenci/bert/train.log
Normal file
54
exp1_fenci/bert/train.log
Normal file
@ -0,0 +1,54 @@
|
|||||||
|
train=85180 val=1738
|
||||||
|
Loading weights: 0%| | 0/199 [00:00<?, ?it/s]
Loading weights: 100%|██████████| 199/199 [00:00<00:00, 42105.96it/s]
|
||||||
|
[1mBertModel LOAD REPORT[0m from: hfl/chinese-bert-wwm-ext
|
||||||
|
Key | Status | |
|
||||||
|
-------------------------------------------+------------+--+-
|
||||||
|
cls.seq_relationship.bias | UNEXPECTED | |
|
||||||
|
cls.predictions.transform.LayerNorm.bias | UNEXPECTED | |
|
||||||
|
cls.predictions.bias | UNEXPECTED | |
|
||||||
|
cls.predictions.transform.dense.bias | UNEXPECTED | |
|
||||||
|
cls.predictions.transform.dense.weight | UNEXPECTED | |
|
||||||
|
cls.seq_relationship.weight | UNEXPECTED | |
|
||||||
|
cls.predictions.decoder.weight | UNEXPECTED | |
|
||||||
|
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | |
|
||||||
|
|
||||||
|
Notes:
|
||||||
|
- UNEXPECTED: can be ignored when loading from different task/architecture; not ok if you expect identical arch.
|
||||||
|
Exception in thread Thread-auto_conversion:
|
||||||
|
Traceback (most recent call last):
|
||||||
|
File "/home/grtsinry43/.conda/envs/nlp-exp/lib/python3.11/site-packages/huggingface_hub/utils/_http.py", line 761, in hf_raise_for_status
|
||||||
|
response.raise_for_status()
|
||||||
|
File "/home/grtsinry43/.conda/envs/nlp-exp/lib/python3.11/site-packages/httpx/_models.py", line 829, in raise_for_status
|
||||||
|
raise HTTPStatusError(message, request=request, response=self)
|
||||||
|
httpx.HTTPStatusError: Client error '403 Forbidden' for url 'https://huggingface.co/api/models/hfl/chinese-bert-wwm-ext/discussions?p=0'
|
||||||
|
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/403
|
||||||
|
|
||||||
|
The above exception was the direct cause of the following exception:
|
||||||
|
|
||||||
|
Traceback (most recent call last):
|
||||||
|
File "/home/grtsinry43/.conda/envs/nlp-exp/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
|
||||||
|
self.run()
|
||||||
|
File "/home/grtsinry43/.conda/envs/nlp-exp/lib/python3.11/threading.py", line 982, in run
|
||||||
|
self._target(*self._args, **self._kwargs)
|
||||||
|
File "/home/grtsinry43/.conda/envs/nlp-exp/lib/python3.11/site-packages/transformers/safetensors_conversion.py", line 117, in auto_conversion
|
||||||
|
raise e
|
||||||
|
File "/home/grtsinry43/.conda/envs/nlp-exp/lib/python3.11/site-packages/transformers/safetensors_conversion.py", line 96, in auto_conversion
|
||||||
|
sha = get_conversion_pr_reference(api, pretrained_model_name_or_path, **cached_file_kwargs)
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
File "/home/grtsinry43/.conda/envs/nlp-exp/lib/python3.11/site-packages/transformers/safetensors_conversion.py", line 69, in get_conversion_pr_reference
|
||||||
|
pr = previous_pr(api, model_id, pr_title, token=token)
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
File "/home/grtsinry43/.conda/envs/nlp-exp/lib/python3.11/site-packages/transformers/safetensors_conversion.py", line 14, in previous_pr
|
||||||
|
for discussion in get_repo_discussions(repo_id=model_id, token=token):
|
||||||
|
File "/home/grtsinry43/.conda/envs/nlp-exp/lib/python3.11/site-packages/huggingface_hub/hf_api.py", line 6949, in get_repo_discussions
|
||||||
|
discussions, has_next = _fetch_discussion_page(page_index=page_index)
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
File "/home/grtsinry43/.conda/envs/nlp-exp/lib/python3.11/site-packages/huggingface_hub/hf_api.py", line 6938, in _fetch_discussion_page
|
||||||
|
hf_raise_for_status(resp)
|
||||||
|
File "/home/grtsinry43/.conda/envs/nlp-exp/lib/python3.11/site-packages/huggingface_hub/utils/_http.py", line 849, in hf_raise_for_status
|
||||||
|
raise _format(HfHubHTTPError, message, response) from e
|
||||||
|
huggingface_hub.errors.HfHubHTTPError: (Request ID: Root=1-69e66348-038e57a352b76902655292cb;7091e990-eb43-487d-900e-ca182e86423e)
|
||||||
|
|
||||||
|
403 Forbidden: Discussions are disabled for this repo.
|
||||||
|
Cannot access content at: https://huggingface.co/api/models/hfl/chinese-bert-wwm-ext/discussions?p=0.
|
||||||
|
Make sure your token has the correct permissions.
|
||||||
177
exp1_fenci/bert/train_bert.py
Normal file
177
exp1_fenci/bert/train_bert.py
Normal file
@ -0,0 +1,177 @@
|
|||||||
|
"""BERT 字符级 BMES 分词训练。"""
|
||||||
|
import csv, os, random, json, time
|
||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
from torch.utils.data import Dataset, DataLoader
|
||||||
|
from transformers import BertTokenizerFast, BertModel, get_linear_schedule_with_warmup
|
||||||
|
from torch.optim import AdamW
|
||||||
|
|
||||||
|
TRAIN = "../train.csv"
|
||||||
|
MODEL_NAME = "hfl/chinese-bert-wwm-ext"
|
||||||
|
SAVE_DIR = "ckpt"
|
||||||
|
MAX_LEN = 128
|
||||||
|
BATCH = 32
|
||||||
|
EPOCHS = 3
|
||||||
|
LR = 3e-5
|
||||||
|
SEED = 42
|
||||||
|
VAL_RATIO = 0.02
|
||||||
|
|
||||||
|
LABELS = ["B", "M", "E", "S"]
|
||||||
|
L2I = {l: i for i, l in enumerate(LABELS)}
|
||||||
|
PAD_ID = -100
|
||||||
|
|
||||||
|
random.seed(SEED); torch.manual_seed(SEED)
|
||||||
|
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||||
|
|
||||||
|
|
||||||
|
def tag_word(w):
    """Return the BMES tag sequence for a single word.

    A one-character word is tagged ``["S"]``; longer words are tagged
    ``B``, zero or more ``M``, then ``E``. An empty word yields an empty
    list so that the number of tags always equals the number of characters.
    """
    n = len(w)
    if n == 0:
        # Without this guard an empty word would get ["B", "E"]: two tags
        # for zero characters, which misaligns tags and characters.
        return []
    if n == 1:
        return ["S"]
    return ["B"] + ["M"] * (n - 2) + ["E"]
|
||||||
|
|
||||||
|
|
||||||
|
def load_pairs():
    """Read TRAIN and return one ``(chars, tags)`` pair per sentence.

    The first CSV row is skipped as a header. Column 0 of every other row is
    split on single spaces into words; empty rows and rows without any word
    are dropped. ``chars`` is the flat character list and ``tags`` the
    matching tag list built with ``tag_word``.
    """
    result = []
    with open(TRAIN, encoding="utf-8") as handle:
        rows = csv.reader(handle)
        next(rows)
        for record in rows:
            if not record:
                continue
            tokens = [piece for piece in record[0].strip().split(" ") if piece]
            if not tokens:
                continue
            chars = [ch for piece in tokens for ch in piece]
            tags = [tag for piece in tokens for tag in tag_word(piece)]
            result.append((chars, tags))
    return result
|
||||||
|
|
||||||
|
|
||||||
|
class SegDS(Dataset):
    """Dataset turning ``(chars, tags)`` pairs into padded tensors for training."""

    def __init__(self, pairs, tok):
        """Store the sentence pairs and the tokenizer used to encode them."""
        self.pairs = pairs
        self.tok = tok

    def __len__(self):
        """Return the number of sentences."""
        return len(self.pairs)

    def __getitem__(self, i):
        """Encode sentence ``i`` and return ``(input_ids, attention_mask, labels)``.

        Positions that do not belong to any input character (``word_ids``
        gives ``None``) are labelled PAD_ID; every other position gets the
        label id of the character it maps to.
        """
        # Two fewer than MAX_LEN — presumably room for [CLS]/[SEP]; confirm.
        limit = MAX_LEN - 2
        chars, tags = self.pairs[i]
        chars, tags = chars[:limit], tags[:limit]
        enc = self.tok(chars, is_split_into_words=True, truncation=True, max_length=MAX_LEN,
                       padding="max_length", return_tensors="pt")
        label_ids = [
            PAD_ID if wid is None else L2I[tags[wid]]
            for wid in enc.word_ids(batch_index=0)
        ]
        labels = torch.tensor(label_ids, dtype=torch.long)
        return enc["input_ids"].squeeze(0), enc["attention_mask"].squeeze(0), labels
|
||||||
|
|
||||||
|
|
||||||
|
class BertTagger(nn.Module):
    """Token classifier: BERT encoder, dropout, then a linear layer per position.

    The attribute names ``bert``, ``drop`` and ``cls`` become the parameter
    prefixes in the module's ``state_dict``, so they are kept unchanged.
    """

    def __init__(self, name, n_labels):
        """Create the encoder from ``name`` and a head with ``n_labels`` outputs."""
        super().__init__()
        encoder = BertModel.from_pretrained(name)
        self.bert = encoder
        self.drop = nn.Dropout(0.1)
        self.cls = nn.Linear(encoder.config.hidden_size, n_labels)

    def forward(self, ids, attn):
        """Return the classifier logits for every input position."""
        return self.cls(self.drop(self.bert(ids, attention_mask=attn).last_hidden_state))
|
||||||
|
|
||||||
|
|
||||||
|
def seg_f1(gold_tags, pred_tags):
    """Return the word-level F1 between two BMES tag sequences.

    Both sequences are converted to sets of inclusive ``(start, end)`` word
    spans; precision and recall are computed over exactly matching spans.
    Returns 0.0 when no span matches.
    """

    def spans(tags):
        found = set()
        open_at = None  # start index of the word currently being read
        for pos, tag in enumerate(tags):
            if tag in ("B", "S"):
                # A new word begins: close whatever was still open.
                if open_at is not None:
                    found.add((open_at, pos - 1))
                if tag == "B":
                    open_at = pos
                else:
                    found.add((pos, pos))
                    open_at = None
            elif tag in ("M", "E"):
                # An M/E without a preceding B opens a word on the spot.
                if open_at is None:
                    open_at = pos
                if tag == "E":
                    found.add((open_at, pos))
                    open_at = None
        if open_at is not None:
            found.add((open_at, len(tags) - 1))
        return found

    gold = spans(gold_tags)
    pred = spans(pred_tags)
    hits = len(gold & pred)
    if hits == 0:
        return 0.0
    precision = hits / len(pred)
    recall = hits / len(gold)
    return 2 * precision * recall / (precision + recall)
|
||||||
|
|
||||||
|
|
||||||
|
def run_val(model, loader):
    """Return the mean per-sentence ``seg_f1`` of ``model`` over ``loader``.

    The model is switched to eval mode and run without gradients. Positions
    labelled PAD_ID are ignored; sentences with no remaining gold label are
    skipped. Returns 0 when nothing was scored.
    """
    model.eval()
    total = 0
    counted = 0
    with torch.no_grad():
        for ids, attn, labels in loader:
            logits = model(ids.to(device), attn.to(device))
            pred_rows = logits.argmax(-1).cpu().numpy()
            gold_rows = labels.numpy()
            for pred_row, gold_row in zip(pred_rows, gold_rows):
                keep = gold_row != PAD_ID
                gold = [LABELS[x] for x in gold_row[keep]]
                if not gold:
                    continue
                guess = [LABELS[x] for x in pred_row[keep]]
                total += seg_f1(gold, guess)
                counted += 1
    return total / max(counted, 1)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Fine-tune the tagger on TRAIN and keep the checkpoint with the best validation F1.

    The shuffled data is split by VAL_RATIO, the model is trained for EPOCHS
    epochs with AdamW and a linear warmup schedule, and after every epoch the
    validation F1 is computed. Whenever it improves, the weights, tokenizer
    and label list are written to SAVE_DIR.
    """
    os.makedirs(SAVE_DIR, exist_ok=True)
    tok = BertTokenizerFast.from_pretrained(MODEL_NAME)
    pairs = load_pairs()
    random.shuffle(pairs)
    n_val = int(len(pairs) * VAL_RATIO)
    val, train = pairs[:n_val], pairs[n_val:]
    print(f"train={len(train)} val={len(val)}")

    tr_loader = DataLoader(SegDS(train, tok), batch_size=BATCH, shuffle=True, num_workers=2, pin_memory=True)
    va_loader = DataLoader(SegDS(val, tok), batch_size=BATCH * 2, num_workers=2, pin_memory=True)

    model = BertTagger(MODEL_NAME, len(LABELS)).to(device)
    opt = AdamW(model.parameters(), lr=LR, weight_decay=0.01)
    total_steps = len(tr_loader) * EPOCHS
    sched = get_linear_schedule_with_warmup(opt, int(0.1 * total_steps), total_steps)
    loss_fn = nn.CrossEntropyLoss(ignore_index=PAD_ID)
    # Mixed precision and loss scaling are only enabled when running on CUDA.
    use_amp = device.type == "cuda"
    scaler = torch.amp.GradScaler("cuda", enabled=use_amp)

    best = 0.0
    step = 0
    for epoch in range(EPOCHS):
        model.train()
        t0 = time.time()
        running = 0.0
        since_log = 0  # batches accumulated in `running` since the last log line
        for ids, attn, labels in tr_loader:
            ids, attn, labels = ids.to(device), attn.to(device), labels.to(device)
            opt.zero_grad()
            with torch.amp.autocast("cuda", dtype=torch.float16, enabled=use_amp):
                logits = model(ids, attn)
                loss = loss_fn(logits.view(-1, len(LABELS)), labels.view(-1))
            scaler.scale(loss).backward()
            # Unscale before clipping so the threshold applies to the true gradients.
            scaler.unscale_(opt)
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            scaler.step(opt)
            scaler.update()
            sched.step()
            running += loss.item()
            step += 1
            since_log += 1
            if step % 200 == 0:
                # `step` is global but `running` restarts every epoch, so the
                # first log line of an epoch may cover fewer than 200 batches;
                # divide by the real count to report a correct average.
                print(f" ep{epoch+1} step{step} loss={running/since_log:.4f} lr={sched.get_last_lr()[0]:.2e}")
                running = 0.0
                since_log = 0
        f1 = run_val(model, va_loader)
        dt = time.time() - t0
        print(f"[epoch {epoch+1}] val F1={f1:.4f} time={dt:.1f}s")
        if f1 > best:
            best = f1
            torch.save(model.state_dict(), f"{SAVE_DIR}/best.pt")
            tok.save_pretrained(SAVE_DIR)
            with open(f"{SAVE_DIR}/labels.json", "w", encoding="utf-8") as f:
                json.dump(LABELS, f)
            print(f" saved best -> {SAVE_DIR}/best.pt")
    print(f"best val F1 = {best:.4f}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
108783
exp1_fenci/chinese_word_freq_list.txt
Normal file
108783
exp1_fenci/chinese_word_freq_list.txt
Normal file
File diff suppressed because it is too large
Load Diff
BIN
exp1_fenci/csu-ai-in-class-nlp-2026.zip
Normal file
BIN
exp1_fenci/csu-ai-in-class-nlp-2026.zip
Normal file
Binary file not shown.
1
exp1_fenci/hmm.json
Normal file
1
exp1_fenci/hmm.json
Normal file
File diff suppressed because one or more lines are too long
162
exp1_fenci/predict.py
Normal file
162
exp1_fenci/predict.py
Normal file
@ -0,0 +1,162 @@
|
|||||||
|
"""用 HMM + 最大匹配词典混合的方式分词。
|
||||||
|
策略:先用高频词典最大匹配,对未匹配部分用 HMM Viterbi 分。
|
||||||
|
"""
|
||||||
|
import csv, json, sys
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
|
MODEL = "hmm.json"
|
||||||
|
TEST = "test.csv"
|
||||||
|
OUT = "submission.csv"
|
||||||
|
FREQ = "chinese_word_freq_list.txt"
|
||||||
|
|
||||||
|
STATES = ("B", "M", "E", "S")
|
||||||
|
|
||||||
|
|
||||||
|
def load_model():
    """Read the JSON file named by MODEL and return the parsed object."""
    with open(MODEL, encoding="utf-8") as handle:
        params = json.load(handle)
    return params
|
||||||
|
|
||||||
|
|
||||||
|
def load_dict():
    """Build the dictionary word set from the FREQ file.

    Each line is split on whitespace; the second field is taken as the word.
    Lines with fewer than two fields and words shorter than two characters
    are ignored.
    """
    vocabulary = set()
    with open(FREQ, encoding="utf-8") as handle:
        for raw in handle:
            fields = raw.strip().split()
            if len(fields) < 2:
                continue
            candidate = fields[1]
            if len(candidate) < 2:
                continue
            vocabulary.add(candidate)
    return vocabulary
|
||||||
|
|
||||||
|
|
||||||
|
def viterbi(chars, model):
    """Decode the highest-scoring BMES tag sequence for ``chars``.

    ``model`` provides log-scores under the keys ``init``, ``trans``,
    ``emit`` and ``emit_default``; characters missing from ``emit[state]``
    fall back to ``emit_default[state]``. Returns one tag per character, or
    an empty list for empty input.

    NOTE: the BMES legality rules (start with B/S; B->M/E; M->M/E; E->B/S;
    S->B/S) are not applied as hard constraints here. Only the last tag is
    restricted, to "E" or "S"; everything else is left to the scores.
    """
    if not chars:
        return []
    init, trans = model["init"], model["trans"]
    emit, emit_def = model["emit"], model["emit_default"]

    def emission(state, ch):
        return emit[state].get(ch, emit_def[state])

    scores = {s: init[s] + emission(s, chars[0]) for s in STATES}
    backptrs = []  # backptrs[t - 1][s] = best predecessor of state s at position t
    for ch in chars[1:]:
        step_scores = {}
        step_back = {}
        for s in STATES:
            e = emission(s, ch)
            # Ties on the score are broken by the larger state name, because
            # the candidates are compared as (score, state) tuples.
            step_scores[s], step_back[s] = max(
                (scores[p] + trans[p][s] + e, p) for p in STATES
            )
        backptrs.append(step_back)
        scores = step_scores

    # The final tag must be E or S.
    _, state = max((scores[s], s) for s in ("E", "S"))
    tags = [state]
    for back in reversed(backptrs):
        state = back[state]
        tags.append(state)
    tags.reverse()
    return tags
|
||||||
|
|
||||||
|
|
||||||
|
def seg_by_tags(chars, tags):
    """Turn characters plus BMES tags into a list of words.

    "B" opens a word (an unfinished one is emitted first), "M" continues it,
    "E" continues and closes it; any other tag is treated as "S" and emits
    the character on its own after flushing. Leftover characters at the end
    form a final word.
    """
    result = []
    current = []  # characters collected for the word in progress

    def emit_current():
        joined = "".join(current)
        if joined:
            result.append(joined)
        del current[:]

    for ch, tag in zip(chars, tags):
        if tag == "B":
            emit_current()
            current.append(ch)
        elif tag == "M":
            current.append(ch)
        elif tag == "E":
            current.append(ch)
            result.append("".join(current))
            del current[:]
        else:  # S
            emit_current()
            result.append(ch)
    emit_current()
    return result
|
||||||
|
|
||||||
|
|
||||||
|
def max_match(chars, word_set, max_len=6):
    """Forward maximum matching against ``word_set``.

    Scans left to right; at each position the longest candidate (at most
    ``max_len`` characters, at least two) that is in ``word_set`` is taken.
    Returns the matched spans as ``(start, end)`` tuples, start inclusive
    and end exclusive. Unmatched characters are skipped and not reported.
    """
    total = len(chars)
    spans = []
    pos = 0
    while pos < total:
        longest = min(max_len, total - pos)
        hit = next(
            (size for size in range(longest, 1, -1)
             if "".join(chars[pos:pos + size]) in word_set),
            0,
        )
        if hit:
            spans.append((pos, pos + hit))
            pos += hit
        else:
            pos += 1
    return spans
|
||||||
|
|
||||||
|
|
||||||
|
def hybrid_segment(sentence, model, word_set):
    """Segment ``sentence`` with dictionary matching plus HMM decoding.

    Dictionary words found by ``max_match`` are kept as they are; every
    stretch of characters between (or around) them is tagged with
    ``viterbi`` and split with ``seg_by_tags``. Returns the words in order;
    an empty sentence gives an empty list.
    """
    chars = list(sentence)
    if not chars:
        return []
    words = []

    def decode_gap(lo, hi):
        # Characters outside the dictionary matches are handled by the HMM.
        piece = chars[lo:hi]
        words.extend(seg_by_tags(piece, viterbi(piece, model)))

    cursor = 0
    for begin, end in max_match(chars, word_set):
        if begin > cursor:
            decode_gap(cursor, begin)
        words.append("".join(chars[begin:end]))
        cursor = end
    if cursor < len(chars):
        decode_gap(cursor, len(chars))
    return words
|
||||||
|
|
||||||
|
|
||||||
|
def transfer(words):
    """Convert a word list into index groups.

    Example: ['我','爱','自然','语言'] -> '[0] [1] [2,3] [4,5]'
    """
    pieces = []
    start = 0
    for word in words:
        stop = start + len(word)
        pieces.append("[" + ",".join(map(str, range(start, stop))) + "]")
        start = stop
    return " ".join(pieces)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Segment every sentence of TEST and write the result to OUT.

    The first input row is skipped as the header; rows with fewer than two
    columns are ignored. Each output row holds the input id and the index
    encoding of the segmentation.
    """
    hmm = load_model()
    vocabulary = load_dict()
    print(f"dict size={len(vocabulary)}")

    with open(TEST, encoding="utf-8") as src, open(OUT, "w", encoding="utf-8", newline="") as dst:
        rows = csv.reader(src)
        sink = csv.writer(dst)
        next(rows)  # header: id,sentence
        sink.writerow(["id", "expected"])
        sink.writerows(
            [row[0], transfer(hybrid_segment(row[1], hmm, vocabulary))]
            for row in rows
            if len(row) >= 2
        )
    print(f"wrote {OUT}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
11
exp1_fenci/sample_submission.csv
Normal file
11
exp1_fenci/sample_submission.csv
Normal file
@ -0,0 +1,11 @@
|
|||||||
|
id,expected
|
||||||
|
1,"[0,1] [2,3] [4] [5] [6,7] [8,9] [10] [11,12]"
|
||||||
|
2,"[0,1] [2] [3,4] [5,6] [7] [8,9] [10]"
|
||||||
|
3,"[0,1,2] [3,4] [5,6] [7] [8] [9] [10] [11,12] [13] [14] [15,16] [17,18] [19] [20,21,22] [23]"
|
||||||
|
4,"[0,1] [2,3] [4] [5,6] [7] [8] [9,10] [11,12] [13] [14,15] [16,17,18,19] [20] [21,22,23] [24,25] [26,27] [28] [29] [30]"
|
||||||
|
5,"[0] [1] [2] [3] [4] [5] [6,7] [8,9] [10,11] [12] [13] [14] [15,16] [17] [18,19] [20,21] [22,23] [24] [25,26] [27,28] [29]"
|
||||||
|
6,"[0,1] [2] [3] [4] [5,6] [7,8] [9] [10,11] [12,13] [14,15] [16,17] [18] [19,20] [21] [22,23] [24]"
|
||||||
|
7,"[0,1,2,3] [4] [5,6] [7] [8] [9,10,11] [12] [13] [14,15] [16] [17,18] [19] [20,21] [22] [23] [24,25] [26] [27] [28,29] [30,31] [32,33] [34]"
|
||||||
|
8,"[0] [1,2] [3] [4] [5,6] [7,8] [9,10] [11] [12,13] [14]"
|
||||||
|
9,"[0] [1] [2] [3,4] [5] [6,7] [8]"
|
||||||
|
10,"[0] [1] [2] [3,4] [5] [6,7] [8,9] [10] [11,12] [13] [14] [15,16] [17]"
|
||||||
|
3986
exp1_fenci/submission.csv
Normal file
3986
exp1_fenci/submission.csv
Normal file
File diff suppressed because it is too large
Load Diff
3986
exp1_fenci/submission_bert.csv
Normal file
3986
exp1_fenci/submission_bert.csv
Normal file
File diff suppressed because it is too large
Load Diff
3986
exp1_fenci/test.csv
Normal file
3986
exp1_fenci/test.csv
Normal file
File diff suppressed because it is too large
Load Diff
86925
exp1_fenci/train.csv
Normal file
86925
exp1_fenci/train.csv
Normal file
File diff suppressed because it is too large
Load Diff
72
exp1_fenci/train_hmm.py
Normal file
72
exp1_fenci/train_hmm.py
Normal file
@ -0,0 +1,72 @@
|
|||||||
|
"""HMM 分词器:BMES 标签 + Laplace 平滑,模型存为 json。"""
|
||||||
|
import csv, json, math
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
|
TRAIN = "train.csv"
|
||||||
|
MODEL = "hmm.json"
|
||||||
|
STATES = ("B", "M", "E", "S")
|
||||||
|
|
||||||
|
|
||||||
|
def tag_word(w):
    """Return the BMES tags for one word, one tag per character.

    Single characters get ``["S"]``; longer words get ``B``, then ``M`` for
    each inner character, then ``E``. An empty word returns ``[]`` rather
    than ``["B", "E"]``, so tags and characters can never get out of step.
    """
    length = len(w)
    if length == 0:
        return []
    if length == 1:
        return ["S"]
    return ["B"] + ["M"] * (length - 2) + ["E"]
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Estimate HMM parameters from TRAIN and save them to MODEL as JSON.

    Counts initial tags, tag transitions and tag-to-character emissions over
    all sentences, converts them to add-one smoothed log probabilities and
    writes the keys ``init``, ``trans``, ``emit`` and ``emit_default``.
    """
    n_states = len(STATES)
    start_counts = defaultdict(float)
    trans_counts = {s: defaultdict(float) for s in STATES}
    emit_counts = {s: defaultdict(float) for s in STATES}
    sentences = 0

    with open(TRAIN, encoding="utf-8") as src:
        rows = csv.reader(src)
        next(rows)
        for record in rows:
            if not record:
                continue
            tokens = [piece for piece in record[0].strip().split(" ") if piece]
            if not tokens:
                continue
            chars = [ch for piece in tokens for ch in piece]
            tags = [tag for piece in tokens for tag in tag_word(piece)]
            start_counts[tags[0]] += 1
            for ch, tag in zip(chars, tags):
                emit_counts[tag][ch] += 1
            for prev_tag, tag in zip(tags, tags[1:]):
                trans_counts[prev_tag][tag] += 1
            sentences += 1

    # Initial-state distribution.
    start_total = sum(start_counts.values())
    init_log = {
        s: math.log((start_counts[s] + 1) / (start_total + n_states)) for s in STATES
    }

    # Transition distribution, one row per source state.
    trans_log = {}
    for s in STATES:
        row_total = sum(trans_counts[s].values())
        trans_log[s] = {
            t: math.log((trans_counts[s][t] + 1) / (row_total + n_states)) for t in STATES
        }

    # Emission distribution; the extra slot in the vocabulary size is the
    # mass reserved for characters never seen with a state.
    vocab = set()
    for s in STATES:
        vocab.update(emit_counts[s].keys())
    vocab_size = len(vocab) + 1
    emit_log = {}
    emit_default = {}
    for s in STATES:
        row_total = sum(emit_counts[s].values())
        denom = row_total + vocab_size
        emit_log[s] = {c: math.log((emit_counts[s][c] + 1) / denom) for c in emit_counts[s]}
        emit_default[s] = math.log(1 / denom)

    payload = {"init": init_log, "trans": trans_log, "emit": emit_log, "emit_default": emit_default}
    with open(MODEL, "w", encoding="utf-8") as dst:
        json.dump(payload, dst, ensure_ascii=False)
    print(f"trained on {sentences} sentences, vocab={len(vocab)}, saved {MODEL}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
20
exp1_fenci/transfer.py
Normal file
20
exp1_fenci/transfer.py
Normal file
@ -0,0 +1,20 @@
|
|||||||
|
"""
|
||||||
|
切分结果转换脚本。
|
||||||
|
param:
|
||||||
|
raw_sen: 切分结果,由空格隔开的字符串。“我 爱 自然 语言 处理”
|
||||||
|
|
||||||
|
return:
|
||||||
|
转换为序列的字符串。"[0] [1] [2,3] [4,5] [6,7]"
|
||||||
|
"""
|
||||||
|
|
||||||
|
def transfer(raw_sen):
    """Convert a space-separated segmentation into index groups.

    Args:
        raw_sen: segmentation result, words separated by spaces,
            e.g. "我 爱 自然 语言 处理".

    Returns:
        The index encoding as a string, e.g. "[0] [1] [2,3] [4,5] [6,7]".
        Empty input gives an empty string.
    """
    groups = []
    offset = 0
    for word in raw_sen.strip().split(' '):
        if not word:
            # Consecutive spaces (or an empty input) produce empty tokens;
            # skip them so no bogus "[]" group is emitted.
            continue
        end = offset + len(word)
        groups.append('[' + ','.join(map(str, range(offset, end))) + ']')
        offset = end
    return ' '.join(groups)
|
||||||
87
exp2_people/analyze.py
Normal file
87
exp2_people/analyze.py
Normal file
@ -0,0 +1,87 @@
|
|||||||
|
"""对爬到的文章分词统计 + 词云。"""
|
||||||
|
import pandas as pd
|
||||||
|
import jieba
|
||||||
|
import jieba.posseg as pseg
|
||||||
|
from collections import Counter
|
||||||
|
from wordcloud import WordCloud
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
from matplotlib import font_manager
|
||||||
|
plt.rcParams["axes.unicode_minus"] = False
|
||||||
|
for _fp in ["/usr/share/fonts/adobe-source-han-sans/SourceHanSansCN-Normal.otf",
|
||||||
|
"/usr/share/fonts/noto-cjk/NotoSansCJK-Regular.ttc"]:
|
||||||
|
import os as _os
|
||||||
|
if _os.path.exists(_fp):
|
||||||
|
font_manager.fontManager.addfont(_fp)
|
||||||
|
plt.rcParams["font.sans-serif"] = [font_manager.FontProperties(fname=_fp).get_name()]
|
||||||
|
break
|
||||||
|
|
||||||
|
INPUT = "articles.xlsx"
|
||||||
|
FONT = "/usr/share/fonts/adobe-source-han-sans/SourceHanSansCN-Normal.otf"
|
||||||
|
FONT_FALLBACK = "/usr/share/fonts/noto-cjk/NotoSansCJK-Regular.ttc"
|
||||||
|
import os
|
||||||
|
if not os.path.exists(FONT):
|
||||||
|
FONT = FONT_FALLBACK
|
||||||
|
|
||||||
|
# Stop words. set("...") only yields single characters, but the filter is
# applied to words of two or more characters, so the multi-character stop
# words contained in the string have to be added explicitly.
STOP = set("的了和是在也及与或对从到把被让使由于这那我们他们") | {"由于", "我们", "他们"}
|
||||||
|
|
||||||
|
# 自定义词典(实验要求:添加新词)
|
||||||
|
for w in ["习近平", "新时代", "中国式现代化", "二十大", "党中央", "共同富裕",
|
||||||
|
"高质量发展", "一带一路", "人类命运共同体", "火神山"]:
|
||||||
|
jieba.add_word(w, freq=1000)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Count nouns and verbs in the crawled articles and draw word clouds.

    Reads the ``content`` column of INPUT, tags it with jieba, and keeps
    words of at least two characters that are not in STOP and contain at
    least one character in the range U+4E00..U+9FFF. Prints and saves the
    top-20 nouns and verbs, then writes three word clouds and one combined
    figure.
    """
    frame = pd.read_excel(INPUT)
    contents = frame["content"].fillna("").astype(str).tolist()
    all_text = "\n".join(contents)
    print(f"total chars={len(all_text)}")

    nouns = Counter()
    verbs = Counter()
    all_words = Counter()
    for token, flag in pseg.cut(all_text):
        word = token.strip()
        if len(word) < 2 or word in STOP:
            continue
        has_cjk = any("\u4e00" <= ch <= "\u9fff" for ch in word)
        if not has_cjk:
            continue
        all_words[word] += 1
        if flag.startswith("n"):
            nouns[word] += 1
        elif flag.startswith("v"):
            verbs[word] += 1

    for heading, counter in (("\n=== Top20 名词 ===", nouns), ("\n=== Top20 动词 ===", verbs)):
        print(heading)
        for word, count in counter.most_common(20):
            print(f" {word}\t{count}")

    for counter, csv_name in ((nouns, "top20_nouns.csv"), (verbs, "top20_verbs.csv")):
        table = pd.DataFrame(counter.most_common(20), columns=["word", "count"])
        table.to_csv(csv_name, index=False)

    # Word cloud over all kept words.
    overall = WordCloud(font_path=FONT, width=1200, height=800,
                        background_color="white", max_words=200)
    overall.generate_from_frequencies(all_words)
    overall.to_file("wordcloud_all.png")

    # Separate clouds for nouns and verbs.
    for counter, cmap, png_name in ((nouns, "Blues", "wordcloud_nouns.png"),
                                    (verbs, "Reds", "wordcloud_verbs.png")):
        cloud = WordCloud(font_path=FONT, width=1000, height=700, background_color="white",
                          colormap=cmap, max_words=100)
        cloud.generate_from_frequencies(counter).to_file(png_name)

    # Combined figure with the three images side by side.
    image_paths = ["wordcloud_all.png", "wordcloud_nouns.png", "wordcloud_verbs.png"]
    titles = ["整体", "名词", "动词"]
    fig, axes = plt.subplots(1, 3, figsize=(21, 7))
    for ax, image_path, title in zip(axes, image_paths, titles):
        ax.imshow(plt.imread(image_path))
        ax.set_title(title, fontsize=20)
        ax.axis("off")
    plt.tight_layout()
    plt.savefig("wordclouds_combined.png", dpi=120)
    print("\nsaved wordcloud_all.png / wordcloud_nouns.png / wordcloud_verbs.png / wordclouds_combined.png")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
BIN
exp2_people/articles.xlsx
Normal file
BIN
exp2_people/articles.xlsx
Normal file
Binary file not shown.
66
exp2_people/crawl.py
Normal file
66
exp2_people/crawl.py
Normal file
@ -0,0 +1,66 @@
|
|||||||
|
"""爬取人民网习主席讲话数据库首页链接 + 每篇标题正文,存到 excel。"""
|
||||||
|
import requests, time, os, re
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
BASE = "http://jhsjk.people.cn/"
|
||||||
|
LIST_URL = BASE + "result/1?form=706&else=501"
|
||||||
|
HEADERS = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36"}
|
||||||
|
OUT_XLSX = "articles.xlsx"
|
||||||
|
|
||||||
|
|
||||||
|
def get(url):
    """Fetch ``url`` and return the response body as text.

    The request is sent with the module-level ``HEADERS`` and a 20-second
    timeout.

    Args:
        url: Address of the page to download.

    Returns:
        str: The decoded response body.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``). Other ``requests`` errors,
            such as timeouts, propagate unchanged.
    """
    r = requests.get(url, headers=HEADERS, timeout=20)
    # Check the status first: reading ``apparent_encoding`` runs charset
    # detection over the whole body, which is wasted work for an error
    # response that is about to be discarded.
    r.raise_for_status()
    # Decode using the charset detected from the content itself.
    r.encoding = r.apparent_encoding
    return r.text
|
||||||
|
|
||||||
|
|
||||||
|
def extract_list():
    """Collect the article links found on the listing page.

    Downloads ``LIST_URL`` and scans every ``<a>`` element that has an
    ``href``. An anchor is kept only when its ``href`` matches
    ``article/<digits>`` and its text is non-empty.

    Returns:
        list[tuple[str, str]]: ``(url, title)`` pairs in document order,
        where ``url`` is ``BASE + href``. Each distinct pair appears once.
    """
    html = get(LIST_URL)
    soup = BeautifulSoup(html, "lxml")
    links = []
    # Companion set gives O(1) duplicate checks while ``links`` keeps the
    # first-seen order.
    seen = set()
    for a in soup.find_all("a", href=True):
        href = a["href"]
        if not re.match(r"^article/(\d+)$", href):
            continue
        title = a.get_text(strip=True)
        if not title:
            continue
        entry = (BASE + href, title)
        if entry in seen:
            continue
        seen.add(entry)
        links.append(entry)
    return links
|
||||||
|
|
||||||
|
|
||||||
|
def extract_article(url):
    """Download one article page and pull out its title and body text.

    Args:
        url: Address of the article page.

    Returns:
        tuple[str, str]: ``(title, text)``. ``title`` is the text of the
        first ``<h1>``, or ``""`` when there is none. ``text`` comes from
        the first element matching ``.d2txt_con``, falling back to
        ``#content``, with text fragments joined by newlines; it is ``""``
        when neither selector matches.
    """
    page = BeautifulSoup(get(url), "lxml")

    heading = page.find("h1")
    if heading:
        title = heading.get_text(strip=True)
    else:
        title = ""

    # Try the primary body container first, then the fallback selector.
    container = page.select_one(".d2txt_con")
    if not container:
        container = page.select_one("#content")

    if container:
        text = container.get_text("\n", strip=True)
    else:
        text = ""

    return title, text
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Crawl every listed article and save the results to ``OUT_XLSX``.

    Each link returned by ``extract_list`` is fetched with
    ``extract_article``. The title from the article page is used when
    present, otherwise the link text from the listing. A failure on one
    article is printed and skipped so the rest of the crawl continues.
    The collected rows (``url``, ``title``, ``content``) are written to an
    Excel file without the index column.
    """
    links = extract_list()
    total = len(links)
    print(f"found {total} links")
    rows = []
    for i, (url, link_title) in enumerate(links, 1):
        try:
            t, body = extract_article(url)
            title = t or link_title
            print(f"[{i}/{total}] {title[:40]} chars={len(body)}")
            rows.append({"url": url, "title": title, "content": body})
        except Exception as e:
            # Deliberate best effort: report the failure and keep crawling.
            print(f"[{i}] failed: {e}")
        # Throttle after every attempt, including failed ones, so an error
        # does not cause the next request to be sent immediately.
        time.sleep(0.5)
    df = pd.DataFrame(rows)
    df.to_excel(OUT_XLSX, index=False)
    print(f"saved {OUT_XLSX}, {len(df)} rows")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
21
exp2_people/top20_nouns.csv
Normal file
21
exp2_people/top20_nouns.csv
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
word,count
|
||||||
|
金融,109
|
||||||
|
人民,69
|
||||||
|
干部,68
|
||||||
|
城市,50
|
||||||
|
领导,41
|
||||||
|
政绩观,40
|
||||||
|
经济,40
|
||||||
|
中国,39
|
||||||
|
问题,33
|
||||||
|
群众,30
|
||||||
|
体系,30
|
||||||
|
讲话,29
|
||||||
|
海洋,28
|
||||||
|
特色,27
|
||||||
|
文章,25
|
||||||
|
风险,24
|
||||||
|
政绩,23
|
||||||
|
习近平,22
|
||||||
|
国家,22
|
||||||
|
主席,22
|
||||||
|
21
exp2_people/top20_verbs.csv
Normal file
21
exp2_people/top20_verbs.csv
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
word,count
|
||||||
|
发展,97
|
||||||
|
坚持,72
|
||||||
|
建设,57
|
||||||
|
工作,50
|
||||||
|
推进,48
|
||||||
|
推动,44
|
||||||
|
创新,33
|
||||||
|
加强,25
|
||||||
|
服务,24
|
||||||
|
树立,23
|
||||||
|
践行,20
|
||||||
|
贯彻,20
|
||||||
|
完善,20
|
||||||
|
实施,19
|
||||||
|
不能,18
|
||||||
|
学习,18
|
||||||
|
教育,17
|
||||||
|
深入,16
|
||||||
|
优化,15
|
||||||
|
考核,14
|
||||||
|
BIN
exp2_people/wordcloud_all.png
Normal file
BIN
exp2_people/wordcloud_all.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 321 KiB |
BIN
exp2_people/wordcloud_nouns.png
Normal file
BIN
exp2_people/wordcloud_nouns.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 211 KiB |
BIN
exp2_people/wordcloud_verbs.png
Normal file
BIN
exp2_people/wordcloud_verbs.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 213 KiB |
BIN
exp2_people/wordclouds_combined.png
Normal file
BIN
exp2_people/wordclouds_combined.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 842 KiB |
BIN
分词实验指导书2026.docx
Normal file
BIN
分词实验指导书2026.docx
Normal file
Binary file not shown.
BIN
基于MindSpore的命名实体识别实验手册.docx
Normal file
BIN
基于MindSpore的命名实体识别实验手册.docx
Normal file
Binary file not shown.
BIN
网络信息抽取和分析指导书 (1).docx
Normal file
BIN
网络信息抽取和分析指导书 (1).docx
Normal file
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user