@@ -2,6 +2,7 @@ import jieba
 import csv
 import nltk
 import re
+import json
 from jieba import posseg
 from nltk import tokenize
 from langdetect import detect
@@ -31,38 +32,37 @@ def processText(randId, text, stopwords):
         return ''
     lang = detect(text)
     sentenses = []
+    sentenses_raw = []
     print(lang)
     if (lang == 'zh-cn' or lang == 'zh-tw' or lang == 'ko'):
         splitted = re.split('。|[\n]+', text)
         print(splitted)
         cutted = []
-        for i in splitted:
-            cutted.append(filterPOS(i))
+        for spl in splitted:
+            cutted.append(filterPOS(spl))
         print(cutted)
-        for i in cutted:
-            result = []
-            for j in i:
-                if (j in stopwords):
-                    continue
-                result.append(j)
-                if (len(result) >= 20):
-                    sentenses.append(' '.join(result.copy()))
-                    result = []
-            if (result != []):
-                sentenses.append(' '.join(result))
+        for spl, raw in zip(cutted, splitted):
+            sentenses.append(' '.join(spl))
+            sentenses_raw.append(raw)
     else:
         sentenses = []
-        for sentence in tokenize.sent_tokenize(text):
-            words = sentence.lower().split(' ')
+        sentenses_raw = []
+        for sentence_raw in tokenize.sent_tokenize(text):
+            words = sentence_raw.lower().split(' ')
             print([''.join([a for a in w1 if a.isalpha()]) for w1 in words])
             sentence = ' '.join([w for w in [''.join([a for a in w1 if a.isalpha()]) for w1 in words] if w not in [sw.lower() for sw in stopwords]])
             sentenses.append(sentence)
+            sentenses_raw.append(sentence_raw)
     result = []
-    result.append(['id', 'text', 'count'])
-    for index, sentence in enumerate(sentenses):
-        result.append([index, sentence, 1000])
-    with open('data/' + randId + '.tsv', 'w', newline='', encoding="utf-8") as f:
-        writer = csv.writer(f, delimiter='\t')
-        writer.writerows(result)
-        f.close()
-    return ('data/' + randId + '.tsv')
+    for index, raw_pair in enumerate(zip(sentenses, sentenses_raw)):
+        sentence, sentence_raw = raw_pair
+        result.append({
+            'id': index,
+            'text': sentence,
+            'count': 10,
+            'rawtxt': sentence_raw,
+        })
+
+    with open('data/' + randId + '.json', 'w', newline='', encoding="utf-8") as fp:
+        json.dump(result, fp, ensure_ascii=False, indent=4)
+    return ('data/' + randId + '.json')
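
Note: with this change, processText now writes a JSON document instead of a TSV file, keeping the raw sentence alongside the tokenized one. A record in the new data/<randId>.json output would look roughly like this (values illustrative; 'count' is hard-coded to 10 in this revision):

    [
        {
            "id": 0,
            "text": "tokenized sentence with stopwords removed",
            "count": 10,
            "rawtxt": "The original raw sentence."
        }
    ]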
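filterPOS is called in the Chinese/Korean branch but is not part of this hunk. A minimal sketch of what such a helper might look like, assuming it uses jieba's part-of-speech tagger (posseg, imported above) and returns a list of tokens; the kept tag prefixes here are an assumption, not the project's actual filter:

    def filterPOS(sentence):
        # Hypothetical sketch: POS-tag the sentence with jieba and keep only
        # content-bearing tokens (nouns 'n*', verbs 'v*', adjectives 'a*').
        # The real filterPOS may apply a different tag set.
        kept = []
        for pair in posseg.cut(sentence):
            if pair.flag and pair.flag[0] in ('n', 'v', 'a'):
                kept.append(pair.word)
        return kept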