|
|
|
import jieba
|
|
|
|
import csv
|
|
|
|
import nltk
|
|
|
|
import re
|
|
|
|
import json
|
|
|
|
from jieba import posseg
|
|
|
|
from nltk import tokenize
|
|
|
|
from langdetect import detect
|
|
|
|
|
|
|
|
# Stopword list: seeded with whitespace tokens, then extended from the
# project's resource file (one stopword per line, UTF-8).
stopWords = ['\n', ' ']

with open('resource/stopWords.txt', 'r', encoding='UTF-8') as file:
    stopWords.extend(line.strip() for line in file)
|
|
|
|
|
|
|
|
|
|
|
|
def filterPOS(text):
    """Segment *text* with jieba and keep only content-word tokens.

    Tokens tagged 'eng' (English), 'x' (non-word/punctuation) or
    'm' (numerals) are discarded.

    Parameters
    ----------
    text : str
        Raw (typically CJK) text to segment.

    Returns
    -------
    list[str]
        The surviving token strings, in original order.

    NOTE(review): the original code built a second list named `stopped`
    that was a verbatim copy of the filtered tokens — the name suggests
    stopword removal was intended but never implemented. The dead copy
    is removed here; actual stopword filtering would change output and
    is deliberately NOT added.
    """
    print(text)  # debug trace of the incoming text
    cutted = [tok.word for tok in posseg.cut(text)
              if tok.flag not in ('eng', 'x', 'm')]
    print(cutted)  # debug trace of the kept tokens
    return cutted
|
|
|
|
|
|
|
|
|
|
|
|
def processText(randId, text, stopwords):
    """Split *text* into sentences, clean each one, and dump the result
    to ``data/<randId>.json``.

    The language is auto-detected. CJK-ish text (zh-cn / zh-tw / ko) is
    split on the full-width full stop or blank lines and run through
    :func:`filterPOS` (POS-based filtering; *stopwords* is NOT used on
    this path). All other text is sentence-tokenized with nltk, lowered,
    stripped to alphabetic characters, and filtered against *stopwords*
    case-insensitively.

    Parameters
    ----------
    randId : str
        Identifier used to build the output file name.
    text : str
        Raw input text.
    stopwords : list[str]
        Stopwords removed from non-CJK sentences.

    Returns
    -------
    str
        Path of the written JSON file, or ``''`` when *text* is empty.
    """
    if text == '':
        return ''

    lang = detect(text)
    print(lang)  # debug: detected language code

    sentences = []      # cleaned, space-joined tokens per sentence
    sentences_raw = []  # original sentence text, parallel to `sentences`

    if lang in ('zh-cn', 'zh-tw', 'ko'):
        # Split on the full-width full stop or runs of newlines.
        splitted = re.split('。|[\n]+', text)
        print(splitted)
        cutted = [filterPOS(part) for part in splitted]
        print(cutted)
        for tokens, raw in zip(cutted, splitted):
            sentences.append(' '.join(tokens))
            sentences_raw.append(raw)
    else:
        # Hoist the lowered stopword set out of the loops: the original
        # rebuilt `[sw.lower() for sw in stopwords]` for EVERY word,
        # making the cleanup quadratic in practice. A set also gives
        # O(1) membership tests.
        lowered_stops = {sw.lower() for sw in stopwords}
        for sentence_raw in tokenize.sent_tokenize(text):
            words = sentence_raw.lower().split(' ')
            # Keep only alphabetic characters of each word.
            cleaned = [''.join(ch for ch in w if ch.isalpha()) for w in words]
            print(cleaned)  # debug: alpha-only tokens before stopword removal
            sentences.append(
                ' '.join(w for w in cleaned if w not in lowered_stops))
            sentences_raw.append(sentence_raw)

    result = [
        {
            'id': index,
            'text': sentence,
            'count': 10,  # fixed placeholder weight kept for downstream compatibility
            'rawtxt': raw,
        }
        for index, (sentence, raw) in enumerate(zip(sentences, sentences_raw))
    ]

    path = 'data/' + randId + '.json'
    with open(path, 'w', newline='', encoding="utf-8") as fp:
        json.dump(result, fp, ensure_ascii=False, indent=4)
    return path
|