편집 기록

프로필 최준호님의 편집

날짜2020.06.22

파이썬 Mecab 문서-단어 행렬 실행시 오류발생 도와주시면 감사하겠습니다.

mecab

리뷰분석

nlp

sklearn

import os 
import pandas as pd
def searchFiles(path):
    filelist = []
    filenames = os.listdir(path)
    for filename in filenames:
        file_path = os.path.join(path, filename)
        filelist.append(file_path)
    return filelist

def main():
    reviews = []
    for filePath in searchFiles('./Reviews/'):
        review = pd.read_csv(filePath, encoding = 'utf-8')
        reviews.append(review)
        docs = pd.concat(reviews, ignore_index=True)
        print(docs)

if __name__=='__main__':
    main()

현재구글 스토어 리뷰를 크롤링으로해 csv 파일 하나를 만들었고 csv파일을 데이터프레임은 위와 같은 코드로 성공했습니다. 이후 이제 전체 문서 집합에서 2번 이상 등장한 단어들을 대상으로 문서-단어 행렬을 구축을 위해 다음 코드를 실행하였습니다.

import MeCab
import os
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import matplotlib.pyplot as plt

def searchFiles(path):
    filelist = []
    filenames = os.listdir(path)
    for filename in filenames:
        file_path = os.path.join(path, filename)
        filelist.append(file_path)
    return filelist

def getNVM_lemma(text):
    tokenizer = MeCab.Tagger()
    parsed = tokenizer.parse(text)
    word_tag = [w for w in parsed.split("\n")]
    pos = []
    tags = ['NNG','NNP','VV','VA','VX','VCP','VCN', 'MAG']
    for word_ in word_tag[:-2]:
        word = word_.split("\t")
        tag = word[1].split(",")
        if(len(word[0]) < 2):
            continue
        if(tag[-1] != '*'):
            t = tag[-1].split('/')
            if(len(t[0])>1 and ('VV' in t[1] or 'VA' in t[1] or 'VX' in t[1])):
                pos.append(t[0])
            else:
                if(tag[0] in tags):
                    pos.append(word[0])
        return pos

def main():
    reviews = []
    for filePath in searchFiles('./Reviews/'):
        review = pd.read_csv(filePath, encoding = 'utf-8')
        reviews.append(review)
    docs = pd.concat(reviews, ignore_index=True)
    tf_vect = CountVectorizer(tokenizer=getNVM_lemma, min_df=2)
    dtm = tf_vect.fit_transform(docs['내용'])

if __name__ == '__main__':
    main()

이때

Traceback (most recent call last):
  File "C:/Users/chlwn/PycharmProjects/untitled1/cos pro 2/2.py", line 71, in <module>
    main()
  File "C:/Users/chlwn/PycharmProjects/untitled1/cos pro 2/2.py", line 68, in main
    dtm = tf_vect.fit_transform(docs['내용'])
  File "C:\Users\chlwn\PycharmProjects\untitled1\venv\lib\site-packages\sklearn\feature_extraction\text.py", line 1199, in fit_transform
    self.fixed_vocabulary_)
  File "C:\Users\chlwn\PycharmProjects\untitled1\venv\lib\site-packages\sklearn\feature_extraction\text.py", line 1110, in _count_vocab
    for feature in analyze(doc):
TypeError: 'NoneType' object is not iterable

Process finished with exit code 1

이와 같은 오류가 발생하는데 어떻게하면 될까요?

프로필 nowp님의 편집

날짜2020.06.22

파이썬 Mecab 문서-단어 행렬 실행시 오류발생 도와주시면 감사하겠습니다.

mecab

리뷰분석

nlp

sklearn

import os 
import pandas as pd
def searchFiles(path):
    filelist = []
    filenames = os.listdir(path)
    for filename in filenames:
        file_path = os.path.join(path, filename)
        filelist.append(file_path)
    return filelist

def main():
    reviews = []
    for filePath in searchFiles('./Reviews/'):
        review = pd.read_csv(filePath, encoding = 'utf-8')
        reviews.append(review)
        docs = pd.concat(reviews, ignore_index=True)
        print(docs)

if __name__=='__main__':
    main()

import MeCab
import os
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import matplotlib.pyplot as plt

def searchFiles(path):
    filelist = []
    filenames = os.listdir(path)
    for filename in filenames:
        file_path = os.path.join(path, filename)
        filelist.append(file_path)
    return filelist

def getNVM_lemma(text):
    tokenizer = MeCab.Tagger()
    parsed = tokenizer.parse(text)
    word_tag = [w for w in parsed.split("\n")]
    pos = []
    tags = ['NNG','NNP','VV','VA','VX','VCP','VCN', 'MAG']
    for word_ in word_tag[:-2]:
        word = word_.split("\t")
        tag = word[1].split(",")
        if(len(word[0]) < 2):
            continue
        if(tag[-1] != '*'):
            t = tag[-1].split('/')
            if(len(t[0])>1 and ('VV' in t[1] or 'VA' in t[1] or 'VX' in t[1])):
                pos.append(t[0])
            else:
                if(tag[0] in tags):
                    pos.append(word[0])
        return pos

def main():
    reviews = []
    for filePath in searchFiles('./Reviews/'):
        review = pd.read_csv(filePath, encoding = 'utf-8')
        reviews.append(review)
        docs = pd.concat(reviews, ignore_index=True)
        tf_vect = CountVectorizer(tokenizer=getNVM_lemma, min_df=2)
        dtm = tf_vect.fit_transform(docs['내용'])

이때

Traceback (most recent call last):
  File "C:/Users/chlwn/PycharmProjects/untitled1/cos pro 2/2.py", line 71, in <module>
    main()
  File "C:/Users/chlwn/PycharmProjects/untitled1/cos pro 2/2.py", line 68, in main
    dtm = tf_vect.fit_transform(docs['내용'])
  File "C:\Users\chlwn\PycharmProjects\untitled1\venv\lib\site-packages\sklearn\feature_extraction\text.py", line 1199, in fit_transform
    self.fixed_vocabulary_)
  File "C:\Users\chlwn\PycharmProjects\untitled1\venv\lib\site-packages\sklearn\feature_extraction\text.py", line 1110, in _count_vocab
    for feature in analyze(doc):
TypeError: 'NoneType' object is not iterable

Process finished with exit code 1

이와 같은 오류가 발생하는데 어떻게하면 될까요?

프로필 최준호님의 편집

날짜2020.06.22

파이썬 Mecab 문서-단어 행렬 실행시 오류발생 도와주시면 감사하겠습니다.

mecab

리뷰분석

import os 
import pandas as pd
def searchFiles(path):
    filelist = []
    filenames = os.listdir(path)
    for filename in filenames:
        file_path = os.path.join(path, filename)
        filelist.append(file_path)
    return filelist

def main():
    reviews = []
    for filePath in searchFiles('./Reviews/'):
        review = pd.read_csv(filePath, encoding = 'utf-8')
        reviews.append(review)
        docs = pd.concat(reviews, ignore_index=True)
        print(docs)

if __name__=='__main__':
    main()

import MeCab
import os
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import matplotlib.pyplot as plt

def searchFiles(path):
    filelist = []
    filenames = os.listdir(path)
    for filename in filenames:
        file_path = os.path.join(path, filename)
        filelist.append(file_path)
    return filelist

def getNVM_lemma(text):
    tokenizer = MeCab.Tagger()
    parsed = tokenizer.parse(text)
    word_tag = [w for w in parsed.split("\n")]
    pos = []
    tags = ['NNG','NNP','VV','VA','VX','VCP','VCN', 'MAG']
    for word_ in word_tag[:-2]:
        word = word_.split("\t")
        tag = word[1].split(",")
        if(len(word[0]) < 2):
            continue
        if(tag[-1] != '*'):
            t = tag[-1].split('/')
            if(len(t[0])>1 and ('VV' in t[1] or 'VA' in t[1] or 'VX' in t[1])):
                pos.append(t[0])
            else:
                if(tag[0] in tags):
                    pos.append(word[0])
        return pos

def main():
    reviews = []
    for filePath in searchFiles('./Reviews/'):
        review = pd.read_csv(filePath, encoding = 'utf-8')
        reviews.append(review)
    docs = pd.concat(reviews, ignore_index=True)
    tf_vect = CountVectorizer(tokenizer=getNVM_lemma, min_df=2)
    dtm = tf_vect.fit_transform(docs['내용'])
if __name__ == '__main__':
    main()

이때

Traceback (most recent call last):
  File "C:/Users/chlwn/PycharmProjects/untitled1/cos pro 2/2.py", line 71, in <module>
    main()
  File "C:/Users/chlwn/PycharmProjects/untitled1/cos pro 2/2.py", line 68, in main
    dtm = tf_vect.fit_transform(docs['내용'])
  File "C:\Users\chlwn\PycharmProjects\untitled1\venv\lib\site-packages\sklearn\feature_extraction\text.py", line 1199, in fit_transform
    self.fixed_vocabulary_)
  File "C:\Users\chlwn\PycharmProjects\untitled1\venv\lib\site-packages\sklearn\feature_extraction\text.py", line 1110, in _count_vocab
    for feature in analyze(doc):
TypeError: 'NoneType' object is not iterable

Process finished with exit code 1

이와 같은 오류가 발생하는데 어떻게하면 될까요?