You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
ptt-sententree/generalText.py

69 lines
2.1 KiB

import jieba
import csv
import nltk
import re
from jieba import posseg
from nltk import tokenize
from langdetect import detect
# Module-level stop-word list: newline and space, plus one word per line
# from the bundled resource file (stripped of surrounding whitespace).
stopWords = ['\n', ' ']
with open('resource/stopWords.txt', 'r', encoding='UTF-8') as file:
    for word in file.readlines():
        stopWords.append(word.strip())
def filterPOS(text):
    """Segment Chinese text with jieba and keep only content words.

    Tokens whose part-of-speech flag is 'eng' (English fragment), 'x'
    (punctuation/other) or 'm' (numeral) are discarded.

    Parameters:
        text: raw sentence or text segment to segment.

    Returns:
        list[str]: the surviving words, in their original order.
    """
    print(text)
    # Hoist the excluded-flag set; the original chained three != tests
    # and then made a pointless full copy of the result list.
    excluded = {'eng', 'x', 'm'}
    kept = [token.word for token in posseg.cut(text)
            if token.flag not in excluded]
    print(kept)
    return kept
def processText(randId, text, stopwords):
    """Tokenize *text*, strip stop words, and write a SentenTree TSV.

    Text detected (via langdetect) as zh-cn / zh-tw / ko is split on '。'
    and newlines and segmented with jieba; anything else is sentence-
    tokenized with NLTK and lower-cased.  The cleaned sentences are written
    to 'data/<randId>.tsv' with header columns id/text/count, count fixed
    at 1000 for every row.

    Parameters:
        randId: identifier used to build the output file name.
        text: raw input document; empty/None input is a no-op.
        stopwords: words to drop (exact match for the CJK branch,
            case-insensitive for the non-CJK branch).

    Returns:
        str: path of the TSV file written, or '' for empty input.
    """
    if not text:  # also guards against None, not just ''
        return ''
    lang = detect(text)
    print(lang)
    if lang in ('zh-cn', 'zh-tw', 'ko'):
        sentences = _processCJK(text, stopwords)
    else:
        sentences = _processLatin(text, stopwords)
    rows = [['id', 'text', 'count']]
    for index, sentence in enumerate(sentences):
        rows.append([index, sentence, 1000])
    path = 'data/' + randId + '.tsv'
    # 'with' closes the file on exit; the original's explicit f.close()
    # inside the with-block was redundant.
    with open(path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f, delimiter='\t')
        writer.writerows(rows)
    return path


def _processCJK(text, stopwords):
    """Split on '。'/newlines, jieba-segment each piece, drop stop words,
    and emit space-joined chunks of at most 20 words per sentence."""
    sentences = []
    splitted = re.split('。|[\n]+', text)
    print(splitted)
    cutted = [filterPOS(segment) for segment in splitted]
    print(cutted)
    for words in cutted:
        chunk = []
        for word in words:
            if word in stopwords:
                continue
            chunk.append(word)
            if len(chunk) >= 20:
                sentences.append(' '.join(chunk))
                chunk = []
        if chunk:  # flush the trailing partial chunk
            sentences.append(' '.join(chunk))
    return sentences


def _processLatin(text, stopwords):
    """NLTK-sentence-tokenize, keep only alphabetic characters of each
    lower-cased word, and drop stop words case-insensitively."""
    # Build the lowered stop-word set ONCE: the original rebuilt the
    # lowered list inside the comprehension condition, i.e. once per
    # word — accidentally quadratic.
    lowered = {sw.lower() for sw in stopwords}
    sentences = []
    for sentence in tokenize.sent_tokenize(text):
        words = sentence.lower().split(' ')
        cleaned = [''.join(ch for ch in w if ch.isalpha()) for w in words]
        print(cleaned)
        sentences.append(' '.join(w for w in cleaned if w not in lowered))
    return sentences