import csv
import re

from jieba import posseg
from langdetect import detect
from nltk import tokenize  # sent_tokenize needs the NLTK 'punkt' data (nltk.download('punkt'))

# Load the stop-word list; newline and space are always treated as stop words.
stopWords = ['\n', ' ']
with open('resource/stopWords.txt', 'r', encoding='UTF-8') as file:
    for word in file:
        stopWords.append(word.strip())


def filterPOS(text):
    """Segment text with jieba and drop English tokens (eng), non-word
    symbols (x) and numerals (m) by their part-of-speech flag."""
    cuttedWithPOS = posseg.cut(text)
    return [i.word for i in cuttedWithPOS
            if i.flag != 'eng' and i.flag != 'x' and i.flag != 'm']


def processText(randId, text, stopwords):
    """Tokenize text, remove stop words and write the resulting sentences
    to data/<randId>.tsv; returns the path of the written file."""
    if text == '':
        return ''
    lang = detect(text)
    sentences = []
    if lang == 'zh-cn' or lang == 'zh-tw' or lang == 'ko':
        # Split on the ideographic full stop or blank lines, then segment each chunk.
        splitted = re.split(r'。|\n+', text)
        cutted = [filterPOS(i) for i in splitted]
        for i in cutted:
            result = []
            for j in i:
                if j in stopwords:
                    continue
                result.append(j)
                # Emit a "sentence" every 20 kept tokens.
                if len(result) >= 20:
                    sentences.append(' '.join(result))
                    result = []
            if result:
                sentences.append(' '.join(result))
    else:
        # Other languages: sentence-split, lower-case, keep alphabetic
        # characters only, and drop stop words.
        loweredStops = [sw.lower() for sw in stopwords]
        for sentence in tokenize.sent_tokenize(text):
            words = sentence.lower().split(' ')
            cleaned = [''.join([a for a in w if a.isalpha()]) for w in words]
            sentences.append(' '.join([w for w in cleaned if w not in loweredStops]))

    # Write a TSV with a header row; every sentence gets a fixed count of 1000.
    rows = [['id', 'text', 'count']]
    for index, sentence in enumerate(sentences):
        rows.append([index, sentence, 1000])
    with open('data/' + randId + '.tsv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f, delimiter='\t')
        writer.writerows(rows)
    return 'data/' + randId + '.tsv'
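

# A minimal usage sketch, assuming resource/stopWords.txt and a data/ output
# directory exist; the id 'demo001' and the sample text below are
# hypothetical placeholders, not values from the original script.
if __name__ == '__main__':
    sample = ('Natural language processing breaks a document into sentences. '
              'Stop words such as "the" and "a" are removed before counting.')
    outputPath = processText('demo001', sample, stopWords)
    print('TSV written to', outputPath)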