如何解决:我可以将 Python pandas 数据框用作 NLP 语料库或文档吗?
我想在我的实验中试用 Doc2Vec 模型
http://tutorialspoint.com/gensim/gensim_doc2vec_model.htm
我想将我的数据集转换为语料库作为训练数据集并应用 Gensim 模型。
这是我的数据集链接
https://drive.google.com/file/d/1S80I_5zkjJfeTzby7OjIqrs1vMJI6jVo/view?usp=sharing
我已经参考了下面这个 StackOverflow 问题,但没能解决:
How to create corpus from pandas data frame to operate with NLTK
你也可以在 google colab 上查看我的代码
https://colab.research.google.com/drive/1BmBNrfsxQ0AIJH_1hfMaMAceQLh2Xk7Q?usp=sharing
import pandas as pd
dataset = pd.read_csv('ADL_Two_column_MoCo.csv',encoding = 'unicode_escape')
dataset = dataset.dropna()
import gensim
def tagged_document(list_of_list_of_words):
for i,list_of_words in enumerate(list_of_list_of_words):
yield gensim.models.doc2vec.TaggedDocument(list_of_words,[i])
data = [dataset]
data
data_for_training = list(tagged_document(data))
model = gensim.models.doc2vec.Doc2Vec(vector_size=40,min_count=2,epochs=30)
model.build_vocab(data_for_training)
model.train(data_for_training,total_examples=model.corpus_count,epochs=model.epochs)
len(data_for_training)
1
data_for_training
[TaggedDocument(words= Smile Canonical Column \
0 C1=CC=C(C=C1)C2OC(C(O2)CO)CO CHIRALPAK AD
1 C1=CC=C(C=C1)C(C(C2=CC=CC=C2)O)O CHIRALPAK AD
2 CC(C1=CC=C(C=C1)C2=CC=CC=C2)O CHIRALPAK AD
5 CC(C1=CC=CC=C1)OC(=O)C2=CC(=CC(=C2)[N+](=O)[O-... CHIRALPAK AD
6 C1=CC=C2C(=C1)C=CC(=C2C3=C(C=CC4=CC=CC=C43)O)O CHIRALPAK AD
.. ... ...
839 C1CC(=O)NC(=O)C1N2C(=O)C3=CC=CC=C3C2=O CHROMEGACHIRAL CCJ
840 CC(C1=CC=C(S1)C(=O)C2=CC=CC=C2)C(=O)O CHROMEGACHIRAL CCJ
841 CCC(COC(=O)C1=CC(=C(C(=C1)OC)OC)OC)(C2=CC=CC=C... CHROMEGACHIRAL CCJ
842 CCC(COC(=O)C1=CC(=C(C(=C1)OC)OC)OC)(C2=CC=CC=C... CHROMEGACHIRAL CCJ
843 CCC(COC(=O)C1=CC(=C(C(=C1)OC)OC)OC)(C2=CC=CC=C... CHROMEGACHIRAL CCJ
Mobile phase
0 methanol
1 n-hexane / ethanol
2 water / acetonitrile
5 methanol
6 n-hexane / 2-propanol
.. ...
839 methanol
840 n-hexane / 2-propanol / trifluoroacetic acid
841 n-heptane / 2-propanol / diethylamine
842 n-hexane / 2-propanol
843 methanol / diethylamine
[828 rows x 3 columns],tags=[0])]
这是我得到的输出结果。
RuntimeError Traceback (most recent call last)
<ipython-input-45-72344a512bb5> in <module>
----> 1 model.train(data_for_training,epochs=model.epochs)
C:\ProgramData\Anaconda3\lib\site-packages\gensim\models\doc2vec.py in train(self,documents,corpus_file,total_examples,total_words,epochs,start_alpha,end_alpha,word_count,queue_factor,report_delay,callbacks)
555 sentences=documents,corpus_file=corpus_file,total_examples=total_examples,total_words=total_words,556 epochs=epochs,start_alpha=start_alpha,end_alpha=end_alpha,word_count=word_count,--> 557 queue_factor=queue_factor,report_delay=report_delay,callbacks=callbacks,**kwargs)
558
559 @classmethod
C:\ProgramData\Anaconda3\lib\site-packages\gensim\models\base_any2vec.py in train(self,sentences,compute_loss,callbacks,**kwargs)
1065 total_words=total_words,epochs=epochs,1066 queue_factor=queue_factor,compute_loss=compute_loss,-> 1067 **kwargs)
1068
1069 def _get_job_params(self,cur_epoch):
C:\ProgramData\Anaconda3\lib\site-packages\gensim\models\base_any2vec.py in train(self,data_iterable,**kwargs)
533 epochs=epochs,534 total_examples=total_examples,--> 535 total_words=total_words,**kwargs)
536
537 for callback in self.callbacks:
C:\ProgramData\Anaconda3\lib\site-packages\gensim\models\base_any2vec.py in _check_training_sanity(self,**kwargs)
1171
1172 if not self.wv.vocab: # should be set by `build_vocab`
-> 1173 raise RuntimeError("you must first build vocabulary before training the model")
1174 if not len(self.wv.vectors):
1175 raise RuntimeError("you must initialize vectors before training the model")
RuntimeError: you must first build vocabulary before training the model
虽然我已经尝试构建了词汇表,但问题出在数据框的格式上。
解决方法
这里有一个非常好的 doc2vec 资源:https://towardsdatascience.com/how-to-vectorize-text-in-dataframes-for-nlp-tasks-3-simple-techniques-82925a5600db
#1. the text must have spaces before and after the =.
#2. word tokenize the doc generating a list of tokenized docs
#3. doc2bow to vectorize the doc into a list of a list
#4. store the corpus in a dataframe column of type object.
from gensim.corpora.dictionary import Dictionary
from nltk.tokenize import word_tokenize
df=pd.read_csv('smile.csv')
df['corpus']=np.empty
df['corpus']=df['corpus'].astype(object)
for key,row in df.iterrows():
doc=str(row['Smile Canonical'])
doc=doc.replace('=',' = ')
tokenized_docs=[word_tokenize(doc.lower())]
#print(dictionary.token2id)
dictionary=Dictionary(tokenized_docs)
corpus = [dictionary.doc2bow(doc) for doc in tokenized_docs]
#print(corpus)
df.loc[key,'corpus']=corpus
print("每个文档被转换成一个词袋,表示每个token出现的频率") 打印(df.head())
版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。