import os
import pandas as pd
def searchFiles(path):
filelist = []
filenames = os.listdir(path)
for filename in filenames:
file_path = os.path.join(path, filename)
filelist.append(file_path)
return filelist
def main():
reviews = []
for filePath in searchFiles('./Reviews/'):
review = pd.read_csv(filePath, encoding = 'utf-8')
reviews.append(review)
docs = pd.concat(reviews, ignore_index=True)
print(docs)
if __name__=='__main__':
main()
현재구글 스토어 리뷰를 크롤링으로해 csv 파일 하나를 만들었고 csv파일을 데이터프레임은 위와 같은 코드로 성공했습니다. 이후 이제 전체 문서 집합에서 2번 이상 등장한 단어들을 대상으로 문서-단어 행렬을 구축을 위해 다음 코드를 실행하였습니다.
import MeCab
import os
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import matplotlib.pyplot as plt
def searchFiles(path):
filelist = []
filenames = os.listdir(path)
for filename in filenames:
file_path = os.path.join(path, filename)
filelist.append(file_path)
return filelist
def getNVM_lemma(text):
tokenizer = MeCab.Tagger()
parsed = tokenizer.parse(text)
word_tag = [w for w in parsed.split("\n")]
pos = []
tags = ['NNG','NNP','VV','VA','VX','VCP','VCN', 'MAG']
for word_ in word_tag[:-2]:
word = word_.split("\t")
tag = word[1].split(",")
if(len(word[0]) < 2):
continue
if(tag[-1] != '*'):
t = tag[-1].split('/')
if(len(t[0])>1 and ('VV' in t[1] or 'VA' in t[1] or 'VX' in t[1])):
pos.append(t[0])
else:
if(tag[0] in tags):
pos.append(word[0])
return pos
def main():
reviews = []
for filePath in searchFiles('./Reviews/'):
review = pd.read_csv(filePath, encoding = 'utf-8')
reviews.append(review)
docs = pd.concat(reviews, ignore_index=True)
tf_vect = CountVectorizer(tokenizer=getNVM_lemma, min_df=2)
dtm = tf_vect.fit_transform(docs['내용'])
if __name__ == '__main__':
main()
이때
Traceback (most recent call last):
File "C:/Users/chlwn/PycharmProjects/untitled1/cos pro 2/2.py", line 71, in <module>
main()
File "C:/Users/chlwn/PycharmProjects/untitled1/cos pro 2/2.py", line 68, in main
dtm = tf_vect.fit_transform(docs['내용'])
File "C:\Users\chlwn\PycharmProjects\untitled1\venv\lib\site-packages\sklearn\feature_extraction\text.py", line 1199, in fit_transform
self.fixed_vocabulary_)
File "C:\Users\chlwn\PycharmProjects\untitled1\venv\lib\site-packages\sklearn\feature_extraction\text.py", line 1110, in _count_vocab
for feature in analyze(doc):
TypeError: 'NoneType' object is not iterable
Process finished with exit code 1
이와 같은 오류가 발생하는데 어떻게하면 될까요?