我需要根据 sparknlp 中 WordEmbeddingsModel 生成的嵌入来创建一个嵌入矩阵。到目前为止,我的代码如下:
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *
# Build the Spark NLP pipeline: text -> document -> token -> embeddings ->
# finished embeddings. Each stage reads the output column of an earlier stage.
document = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# Load the pretrained embeddings named "w2v_cc_300d" for language code "sq".
embeddings = (
    WordEmbeddingsModel.pretrained("w2v_cc_300d", "sq")
    .setInputCols(["document", "token"])
    .setOutputCol("embeddings")
)

# NOTE(review): setOutputAsVector(True) presumably emits Spark ML vectors
# rather than plain arrays -- confirm against the EmbeddingsFinisher docs.
embeddingsFinisher = (
    EmbeddingsFinisher()
    .setInputCols("embeddings")
    .setOutputCols("finished_embeddings")
    .setOutputAsVector(True)
)

pipeline = Pipeline(
    stages=[
        document,
        tokenizer,
        embeddings,
        embeddingsFinisher,
    ]
)

# spark_train_df is defined elsewhere; presumably a DataFrame with a "text"
# column, since that is what the first stage reads -- verify.
model = pipeline.fit(spark_train_df)
在这种情况下,模型中包含一个 WordEmbeddingsModel 注释器,但这个注释器没有用于获取词汇表的 getVocab 方法。如果模型可用的属性和方法列表如下所示,我该如何检索词汇表?
dir(model)
['__abstractmethods__',
'__class__',
'__class_getitem__',
'__delattr__',
'__dict__',
'__dir__',
'__doc__',
'__eq__',
'__format__',
'__ge__',
'__getattribute__',
'__gt__',
'__hash__',
'__init__',
'__init_subclass__',
'__le__',
'__lt__',
'__module__',
'__ne__',
'__new__',
'__orig_bases__',
'__parameters__',
'__reduce__',
'__reduce_ex__',
'__repr__',
'__setattr__',
'__sizeof__',
'__slots__',
'__str__',
'__subclasshook__',
'__weakref__',
'_abc_impl',
'_copyValues',
'_copy_params',
'_defaultParamMap',
'_dummy',
'_from_java',
'_is_protocol',
'_paramMap',
'_params',
'_randomUID',
'_resetUid',
'_resolveParam',
'_set',
'_setDefault',
'_shouldOwn',
'_testOwnParam',
'_to_java',
'_transform',
'clear',
'copy',
'explainParam',
'explainParams',
'extractParamMap',
'getOrDefault',
'getParam',
'hasDefault',
'hasParam',
'isDefined',
'isSet',
'load',
'params',
'read',
'save',
'set',
'stages',
'transform',
'uid',
'write']