ptt-sententree/generalText.py
import jieba
import csv
import nltk
import re
import json
from jieba import posseg
from nltk import tokenize
from langdetect import detect
# Module-level stop-word list used when cleaning tokenised sentences.
# Newline and the plain space are always stop words; the rest come from
# the resource file (one word per line, UTF-8).
stopWords = ['\n', ' ']
with open('resource/stopWords.txt', 'r', encoding='UTF-8') as stop_file:
    for line in stop_file:
        stopWords.append(line.strip())
def filterPOS(text):
    """Segment *text* with jieba POS tagging and keep only content words.

    Tokens tagged 'eng' (English), 'x' (non-word/punctuation) or 'm'
    (numeral) are dropped.

    Returns a list of the surviving word strings.

    NOTE(review): despite the module's stop-word list, no stop-word
    filtering happens here — the original `stopped` list was a verbatim
    copy of `cutted`, so that no-op copy has been removed.
    """
    # posseg.cut yields pairs with .word and .flag attributes.
    return [pair.word for pair in posseg.cut(text)
            if pair.flag not in ('eng', 'x', 'm')]
def processText(randId, text, stopwords):
    """Split *text* into cleaned sentences and dump them to a JSON file.

    Parameters
    ----------
    randId : str
        Identifier used to name the output file ``data/<randId>.json``.
    text : str
        Raw document text; empty input short-circuits.
    stopwords : iterable of str
        Stop words, applied case-insensitively — only on the non-CJK
        branch (the CJK branch relies on filterPOS's POS filtering;
        presumably intentional, but worth confirming with the author).

    Returns
    -------
    str
        Path of the JSON file written, or '' for empty input.
    """
    if not text:
        return ''

    lang = detect(text)  # langdetect code, e.g. 'en', 'zh-cn', 'zh-tw'
    sentences = []
    sentences_raw = []

    if lang in ('zh-cn', 'zh-tw', 'ko'):
        # CJK-ish text: split on the ideographic full stop or newline
        # runs, then POS-filter each piece with jieba.
        pieces = re.split(r'。|[\n]+', text)
        for piece in pieces:
            sentences.append(' '.join(filterPOS(piece)))
            sentences_raw.append(piece)
    else:
        # Latin-script text: NLTK sentence tokeniser, then keep only
        # alphabetic characters and drop stop words.
        # Lower-case the stop words ONCE; the original rebuilt this list
        # for every single word tested (accidental quadratic work).
        lowered_stops = {sw.lower() for sw in stopwords}
        for sentence_raw in tokenize.sent_tokenize(text):
            words = sentence_raw.lower().split(' ')
            cleaned = [''.join(ch for ch in w if ch.isalpha()) for w in words]
            sentences.append(
                ' '.join(w for w in cleaned if w not in lowered_stops))
            sentences_raw.append(sentence_raw)

    result = [
        {
            'id': index,
            'text': sentence,
            'count': 10,  # fixed placeholder weight — TODO confirm consumer
            'rawtxt': raw,
        }
        for index, (sentence, raw) in enumerate(zip(sentences, sentences_raw))
    ]

    out_path = 'data/' + randId + '.json'
    with open(out_path, 'w', newline='', encoding='utf-8') as fp:
        json.dump(result, fp, ensure_ascii=False, indent=4)
    return out_path