import re
import json

from jieba import posseg
from nltk import tokenize
from langdetect import detect

# Load the stop-word list once at import time; newline and space are treated
# as stop words as well.
stopWords = ['\n', ' ']
with open('resource/stopWords.txt', 'r', encoding='UTF-8') as file:
    for word in file:
        stopWords.append(word.strip())


def filterPOS(text):
    """Segment CJK text with jieba and drop English tokens ('eng'),
    non-word symbols ('x'), numerals ('m') and stop words."""
    print(text)  # debug output
    cuttedWithPOS = posseg.cut(text)
    cutted = [token.word for token in cuttedWithPOS
              if token.flag not in ('eng', 'x', 'm')]
    # Remove stop words from the segmented tokens.
    stopped = [word for word in cutted if word not in stopWords]
    print(stopped)  # debug output
    return stopped


def processText(randId, text, stopwords):
    """Split `text` into sentences, clean each one, and write the result to
    data/<randId>.json. Returns the path of the written file."""
    if text == '':
        return ''

    lang = detect(text)
    print(lang)  # debug output
    sentences = []
    sentences_raw = []

    if lang in ('zh-cn', 'zh-tw', 'ko'):
        # CJK text: split on the full-width full stop or blank lines, then
        # segment each chunk with jieba.
        splitted = re.split(r'。|\n+', text)
        print(splitted)  # debug output
        cutted = [filterPOS(chunk) for chunk in splitted]
        print(cutted)  # debug output
        for words, raw in zip(cutted, splitted):
            sentences.append(' '.join(words))
            sentences_raw.append(raw)
    else:
        # Other languages: sentence-split with NLTK (requires the 'punkt'
        # tokenizer data), keep alphabetic characters only, drop stop words.
        lowered_stopwords = [sw.lower() for sw in stopwords]
        for sentence_raw in tokenize.sent_tokenize(text):
            words = sentence_raw.lower().split(' ')
            cleaned = [''.join(ch for ch in w if ch.isalpha()) for w in words]
            print(cleaned)  # debug output
            sentence = ' '.join(w for w in cleaned if w not in lowered_stopwords)
            sentences.append(sentence)
            sentences_raw.append(sentence_raw)

    result = []
    for index, (sentence, sentence_raw) in enumerate(zip(sentences, sentences_raw)):
        result.append({
            'id': index,
            'text': sentence,
            'count': 10,
            'rawtxt': sentence_raw,
        })

    outputPath = 'data/' + randId + '.json'
    with open(outputPath, 'w', encoding='utf-8') as fp:
        json.dump(result, fp, ensure_ascii=False, indent=4)
    return outputPath
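

# Minimal usage sketch (not part of the original module): the id and sample
# text below are made-up examples. It assumes 'resource/stopWords.txt' exists,
# a writable 'data/' directory is present, and NLTK's 'punkt' tokenizer data
# is installed for non-CJK input.
if __name__ == '__main__':
    sample_text = 'This is a short example. It shows the expected output file.'
    output_path = processText('example-id', sample_text, stopWords)
    print('Wrote', output_path)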