    Python MeCab: an error occurs when building a document-term matrix. I would appreciate your help.


    import os 
    import pandas as pd
    def searchFiles(path):
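        # Return the full path of every file in the given directory.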
        filelist = []
        filenames = os.listdir(path)
        for filename in filenames:
            file_path = os.path.join(path, filename)
            filelist.append(file_path)
        return filelist
    
    def main():
        reviews = []
        for filePath in searchFiles('./Reviews/'):
            review = pd.read_csv(filePath, encoding = 'utf-8')
            reviews.append(review)
        docs = pd.concat(reviews, ignore_index=True)
        print(docs)
    
    if __name__ == '__main__':
        main()
    

    I crawled Google Play Store reviews into a single CSV file, and loading that CSV into a pandas DataFrame worked with the code above. Next, to build a document-term matrix over the words that appear at least twice across the whole document set, I ran the following code:

    import MeCab
    import os
    import pandas as pd
    from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
    import matplotlib.pyplot as plt
    
    def searchFiles(path):
        filelist = []
        filenames = os.listdir(path)
        for filename in filenames:
            file_path = os.path.join(path, filename)
            filelist.append(file_path)
        return filelist
    
    def getNVM_lemma(text):
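        # Tokenizer passed to CountVectorizer: extract content-word tokens
        # from MeCab's parse of one document.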
        tokenizer = MeCab.Tagger()
        parsed = tokenizer.parse(text)
        word_tag = [w for w in parsed.split("\n")]
        pos = []
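        # POS tags to keep: common/proper nouns (NNG, NNP), verbs, adjectives,
        # and auxiliaries (VV, VA, VX), copulas (VCP, VCN), general adverbs (MAG).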
        tags = ['NNG','NNP','VV','VA','VX','VCP','VCN', 'MAG']
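        # MeCab's parse output ends with an 'EOS' line plus an empty string
        # after the final newline, so the loop skips the last two entries.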
        for word_ in word_tag[:-2]:
            word = word_.split("\t")
            tag = word[1].split(",")
            if(len(word[0]) < 2):
                continue
            if(tag[-1] != '*'):
                t = tag[-1].split('/')
                if(len(t[0])>1 and ('VV' in t[1] or 'VA' in t[1] or 'VX' in t[1])):
                    pos.append(t[0])
                else:
                    if(tag[0] in tags):
                        pos.append(word[0])
            return pos
    
    def main():
        reviews = []
        for filePath in searchFiles('./Reviews/'):
            review = pd.read_csv(filePath, encoding = 'utf-8')
            reviews.append(review)
        docs = pd.concat(reviews, ignore_index=True)
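        # min_df=2 keeps only the words that appear in at least two documents.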
        tf_vect = CountVectorizer(tokenizer=getNVM_lemma, min_df=2)
        dtm = tf_vect.fit_transform(docs['내용'])
    
    if __name__ == '__main__':
        main()
    

    When I run it, the following error occurs:

    Traceback (most recent call last):
      File "C:/Users/chlwn/PycharmProjects/untitled1/cos pro 2/2.py", line 71, in <module>
        main()
      File "C:/Users/chlwn/PycharmProjects/untitled1/cos pro 2/2.py", line 68, in main
        dtm = tf_vect.fit_transform(docs['내용'])
      File "C:\Users\chlwn\PycharmProjects\untitled1\venv\lib\site-packages\sklearn\feature_extraction\text.py", line 1199, in fit_transform
        self.fixed_vocabulary_)
      File "C:\Users\chlwn\PycharmProjects\untitled1\venv\lib\site-packages\sklearn\feature_extraction\text.py", line 1110, in _count_vocab
        for feature in analyze(doc):
    TypeError: 'NoneType' object is not iterable
    
    Process finished with exit code 1
    

    How can I fix this error?
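
    For reference, a likely cause given the traceback: CountVectorizer iterates over whatever the custom tokenizer returns, and getNVM_lemma can return None. Its return statement is indented inside the for loop, so a review whose parse yields no loop iterations (an empty string, for example), or whose tokens are all skipped by continue, never reaches a return and implicitly returns None; even when it does return, it stops after the first token it processes. A minimal sketch of the fix, keeping the posted logic otherwise unchanged:

    import MeCab

    def getNVM_lemma(text):
        tokenizer = MeCab.Tagger()
        parsed = tokenizer.parse(text)
        word_tag = parsed.split("\n")
        pos = []
        tags = ['NNG', 'NNP', 'VV', 'VA', 'VX', 'VCP', 'VCN', 'MAG']
        for word_ in word_tag[:-2]:  # skip the trailing 'EOS' and empty line
            word = word_.split("\t")
            tag = word[1].split(",")
            if len(word[0]) < 2:  # skip one-character surface forms
                continue
            if tag[-1] != '*':
                t = tag[-1].split('/')
                if len(t[0]) > 1 and ('VV' in t[1] or 'VA' in t[1] or 'VX' in t[1]):
                    pos.append(t[0])
                else:
                    if tag[0] in tags:
                        pos.append(word[0])
        # Return outside the loop: every document now yields a list (possibly
        # empty), so CountVectorizer always receives an iterable.
        return pos

    If the CSV contains empty cells, docs['내용'] may also hold NaN floats that MeCab cannot parse; passing docs['내용'].astype(str) to fit_transform (or dropping the NaNs first) is a reasonable safeguard, though that is an assumption about the data rather than something the traceback shows.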
