# Python
import os
import sys
import re
import spacy
import json
from i18n import *
# Setup (run once):
#   pip install spacy
#   python -m spacy download en_core_web_sm
# Load the "en_core_web_sm" pipeline once at module level; it is used below
# to parse the sample text.
nlp = spacy.load("en_core_web_sm")
class Word:
    """Wrap one token of a parsed document and export it as a plain dict."""

    # Class-level default; each instance overrides it in __init__.
    token = None

    def __init__(self, token) -> None:
        self.token = token

    def getPos(self):
        """Return the ``pos_`` value of the wrapped token."""
        return self.token.pos_

    def toObj(self):
        """Build a JSON-serialisable description of the wrapped token.

        Returns:
            A dict with the keys ``content`` (the token as a string),
            ``pos1`` and ``pos2`` (both taken from the token's ``pos_``),
            ``depen`` (the token's ``dep_`` passed through
            ``dependency_labels_i18n`` with index 1) and, only when it
            differs from ``content``, ``lemma``.
        """
        tok = self.token
        text = str(tok)
        obj = {
            "content": text,
            "pos1": tok.pos_,
            "pos2": self.getPos(),
            "depen": dependency_labels_i18n(tok.dep_, 1),
        }
        # Lemmatisation: the lemma is reported only when it is not the same
        # as the surface text.
        if tok.lemma_ != text:
            obj["lemma"] = tok.lemma_
        return obj
# Sample input text.
astr = '''
this is a dog
'''
astr = astr.strip()
# Collapse every run of two or more whitespace characters into one space.
# NOTE(review): a lone newline or tab is left untouched by this pattern and
# is presumably handed to spaCy as-is — confirm that this is intended.
astr = re.sub(r"(\s){2,}", " ", astr)
document = nlp(astr)

# Split the text into sentences and analyse every word: `data` holds one
# list per sentence, each containing one dict per token (see Word.toObj).
data = [[Word(token).toObj() for token in sent] for sent in document.sents]

# Assemble the payload to return.
sentences = [str(sent) for sent in document.sents]
j = json.dumps(
    {
        "sents": sentences,
        "count": len(sentences),
        "data": data,
    },
    ensure_ascii=False,  # keep non-ASCII text readable in the output
)
print(j)
# data.append({token:token.pos_ for token in sent})
# sentences.append(data)
# for i in sentences:
# print(i)
#tags
#all_tags = {w.pos: w.pos_ for w in document}
#for word in list(document.sents)[0]:
# print(" %s -- %s " % (word,word.tag_))
# ['DET', 'ADJ', 'NOUN', 'VERB', 'DET', 'ADJ', 'ADJ', 'NOUN', 'PUNCT']
# Chinese equivalents: [冠词, 形容词, 名词, 动词, 冠词, 形容词, 形容词, 名词, 标点]
# https://github.com/clir/clearnlp-guidelines/blob/master/md/specifications/dependency_labels.md
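# Python — NOTE(review): a second listing starts here; it appears to be the `i18n` module imported above (confirm).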
# Lookup table for dependency labels.
# Key:   the label in upper case.
# Value: [English description, Chinese description].
dependency_labels = {
    "ACL" : ["Clausalmodifierofnoun","名词的从句修饰语"],
    "ACOMP" : ["Adjectivalcomplement","形容词补语"],
    "ADVCL" : ["Adverbialclausemodifier","状语从句修饰语"],
    "ADVMOD" : ["Adverbialmodifier","状语修饰语"],
    "AGENT" : ["Agent","代理"],
    "AMOD" : ["Adjectivalmodifier","形容词修饰语"],
    "APPOS" : ["Appositionalmodifier","同位修饰语"],
    "ATTR" : ["Attribute","属性"],
    "AUX" : ["Auxiliary","辅助的"],
    "AUXPASS" : ["Auxiliary(passive)","辅助(被动)"],
    "CASE" : ["Casemarker","格位标记"],
    "CC" : ["Coordinatingconjunction","并列连词"],
    "CCOMP" : ["Clausalcomplement","从句补语"],
    "COMPOUND" : ["Compoundmodifier","复合名词修饰"],
    "CONJ" : ["Conjunct","连词"],
    "CSUBJ" : ["Clausalsubject","主语"],
    "CSUBJPASS" : ["Clausalsubject(passive)","子句主语(被动)"],
    "DATIVE" : ["Dative","与格"],
    "DEP" : ["Unclassifieddependent","未分类从属"],
    "DET" : ["Determiner","限定词"],
    "DOBJ" : ["DirectObject","直接宾语"],
    "EXPL" : ["Expletive","虚词"],
    "INTJ" : ["Interjection","感叹词"],
    "MARK" : ["Marker","标记"],
    "META" : ["Metamodifier","元修饰符"],
    "NEG" : ["Negationmodifier","否定修饰符"],
    # spaCy's English parser emits this relation as "nmod"; the alias keeps
    # it translatable alongside the "NOUNMOD" spelling below.
    "NMOD" : ["Modifierofnominal","名词修饰语"],
    "NOUNMOD" : ["Modifierofnominal","名词修饰语"],
    # spaCy's English parser emits this relation as "npadvmod"; alias of
    # the "NPMOD" spelling below.
    "NPADVMOD" : ["Nounphraseasadverbialmodifier","名词短语状语修饰语"],
    "NPMOD" : ["Nounphraseasadverbialmodifier","名词短语状语修饰语"],
    "NSUBJ" : ["Nominalsubject","名词性主语"],
    "NSUBJPASS" : ["Nominalsubject(passive)","名义主语(被动)"],
    "NUMMOD" : ["Numbermodifier","数字修饰符"],
    "OPRD" : ["Objectpredicate","对象谓词"],
    "PARATAXIS" : ["Parataxis","并列"],
    "PCOMP" : ["Complementofpreposition","介词补语"],
    "POBJ" : ["Objectofpreposition","介词宾语"],
    "POSS" : ["Possessionmodifier","占有修饰符"],
    "PRECONJ" : ["Pre-correlativeconjunction","pre-关联连词"],
    "PREDET" : ["Pre-determiner","前位限定词"],
    "PREP" : ["Prepositionalmodifier","介词修饰语"],
    "PRT" : ["Particle","小品词"],
    "PUNCT" : ["Punctuation","标点"],
    "QUANTMOD" : ["Modifierofquantifier","量词修饰语"],
    "RELCL" : ["Relativeclausemodifier","关系从句修饰符"],
    "ROOT" : ["Root","根节点"],
    "XCOMP" : ["Openclausalcomplement","开句补语"],
}
def dependency_labels_i18n(original_str, index=0):
    """Translate a dependency label into a human-readable description.

    The label is upper-cased and looked up in ``dependency_labels``.

    Args:
        original_str: Dependency label; letter case is ignored.
        index: Position of the wanted description inside the table entry.

    Returns:
        The description at ``index``. If ``index`` is out of range in
        either direction, the entry's first description is returned. If
        the label is not in the table, ``original_str`` is returned
        unchanged.
    """
    entry = dependency_labels.get(original_str.upper())
    if entry is None:
        return original_str
    try:
        return entry[index]
    except IndexError:
        # Out-of-range index (too large or too negative): fall back to the
        # first description instead of failing.
        return entry[0]