我从 spaCy 文档中拿了一段代码,它可以为文本分配自定义的依存标签,我希望用它来解析用户的意图。代码基本可以运行,但例如当我运行下面的代码时,它把“delete”标记为 'ROOT',而按照 deps 字典,它应该被标记为 'INTENT'。
from __future__ import unicode_literals, print_functionimport placimport randomimport spacyfrom pathlib import Path# 训练数据:文本、头部和依赖标签# 对于没有关系的,我们简单地选择一个任意的依赖标签,例如 '-'TRAIN_DATA = [ ("How do I delete my account?", { 'heads': [3, 3, 3, 3, 5, 3, 3], # 标记头部的索引 'deps': ['ROOT', '-', '-', 'INTENT', '-', 'OBJECT', '-'] }), ("How do I add a balance?", { 'heads': [3, 3, 3, 3, 5, 3, 3], 'deps': ['ROOT', '-', '-', 'INTENT', '-', 'OBJECT', '-'] }), ("How do I deposit my funds into my bank account?", { 'heads': [3, 3, 3, 3, 5, 3, 3, 9, 9, 6, 3], 'deps': ['ROOT', '-', '-', 'INTENT', '-', '-', '-', '-', '-', 'OBJECT', '-'] }), ("How do I fill out feedback forms?", { 'heads': [3, 3, 3, 3, 3, 6, 3, 3], 'deps': ['ROOT', '-', '-', 'INTENT', '-', '-', 'OBJECT', '-'] }), #("How does my profile impact my score?", { #'heads': [4, 4, 4, 4, 4, 6, 4, 4], #'deps': ['ROOT', '-', '-', '-', 'INTENT', '-', 'OBJECT' '-'] #}), ("What are the fees?", { 'heads': [1, 1, 3, 1, 1], 'deps': ['ROOT', '-', '-', 'INTENT', '-'] }), ("How do I update my profile picture?", { 'heads': [3, 3, 3, 3, 6, 6, 3, 3], 'deps': ['ROOT', '-', '-', 'INTENT', '-', 'OBJECT', 'OBJECT', '-'] }), ("How do I add a referral to the marketplace?", { 'heads': [3, 3, 3, 3, 5, 3, 3, 8, 6, 3], 'deps': ['ROOT', '-', '-', 'INTENT', '-', 'OBJECT', '-', '-', 'OBJECT', '-'] }),]@plac.annotations( model=("模型名称。默认为空白的 'en' 模型。", "option", "m", str), output_dir=("可选的输出目录", "option", "o", Path), n_iter=("训练迭代次数", "option", "n", int))def main(model=None, output_dir=None, n_iter=5): """加载模型,设置管道并训练解析器。""" if model is not None: nlp = spacy.load(model) # 加载已存在的SpaCy模型 print("已加载模型 '%s'" % model) else: nlp = spacy.blank('en') # 创建空白的语言类 print("已创建空白 'en' 模型") # 我们将使用内置的依赖解析器类,但我们想要创建一个新的实例 - 以防万一。 if 'parser' in nlp.pipe_names: nlp.remove_pipe('parser') parser = nlp.create_pipe('parser') nlp.add_pipe(parser, first=True) #向解析器添加新的标签 for text, annotations in TRAIN_DATA: for dep in annotations.get('deps', []): parser.add_label(dep) other_pipes = [pipe 
for pipe in nlp.pipe_names if pipe != 'parser'] with nlp.disable_pipes(*other_pipes): # 只训练解析器 optimizer = nlp.begin_training() for itn in range(n_iter): random.shuffle(TRAIN_DATA) losses = {} for text, annotations in TRAIN_DATA: nlp.update([text], [annotations], sgd=optimizer, losses=losses) print(losses) # 测试训练后的模型 test_model(nlp) # 将模型保存到输出目录 if output_dir is not None: output_dir = Path(output_dir) if not output_dir.exists(): output_dir.mkdir() nlp.to_disk(output_dir) print("模型已保存到", output_dir) # 测试保存的模型 print("从", output_dir, "加载") nlp2 = spacy.load(output_dir) test_model(nlp2)def test_model(nlp): texts = ["How do I delete my account?"] docs = nlp.pipe(texts) for doc in docs: print(doc.text) print([(t.text, t.dep_, t.head.text) for t in doc if t.dep_ != '-'])if __name__ == '__main__': plac.call(main)
这是输出结果:
How do I delete my account?
[(u'How', u'ROOT', u'delete'), (u'delete', u'ROOT', u'delete'), (u'account', u'OBJECT', u'delete')]
回答:
我认为你的问题根源在于:依存树的根节点会被自动标记为 'ROOT'
(依存树的根节点被定义为头部(head)是其自身的那个词符(token))。
一个可能的解决方法是为你的训练数据添加一个人工根节点:
("root How do I delete my account?", {
    'heads': [0, 4, 4, 4, 0, 6, 4, 4],  # 标记头部的索引
    'deps': ['ROOT', '-', '-', '-', 'INTENT', '-', 'OBJECT', '-']
})
(同时在你的测试示例中也添加 root 符号:texts = ["root How do I delete my account?"])
通过这些更改,如果你对模型进行足够长的训练,你将得到:
root How do I delete my account?
[('root', 'ROOT', 'root'), ('delete', 'INTENT', 'root'), ('account', 'OBJECT', 'delete')]