# ptt-sententree/dataHandlerPTT.py
# Data handler for the PTT sentence-tree visualization backend.
import sys
import threading
import jieba
import re
import csv
import json
import _pickle as pickle
import os
import io
import jieba
import multiprocessing as mp
from time import time, sleep
from functools import partial
from numpy import prod
from jieba import posseg
from progressbar import ProgressBar
from datetime import datetime
from PTTData import PTTData
# Date range served by the last getDefault() call; used to detect when the
# post cache must be reloaded.
defaultDate = {
    'startDate': None,
    'endDate': None
}

# Full post corpus, loaded once at import time from the pickled content dump.
# NOTE(review): pickle.load assumes this file is trusted local data.
postContents = None
with open('/home/vis/pttDatabase/PTTData/Gossiping/content/content.pck', 'rb') as f:
    # The 'with' block closes the file; the original's explicit f.close()
    # inside it was redundant and has been dropped.
    postContents = pickle.load(f)

defaultStopWords = []
data = PTTData('Gossiping', '/home/vis/pttDatabase/PTTData')

# Maximum tokens per sentence chunk when no keyword is given.
sentence_length = 100
# When False, overview rows get a flat weight instead of push-based weight.
use_push_count = False

with open('resource/stopWords.txt', 'r', encoding='UTF-8') as file:
    for word in file.readlines():
        word = word.strip()
        defaultStopWords.append(word)
def calcKey(startDate, endDate, keyword, pos):
    """Return a deterministic cache-key string for a query.

    The original implementation derived the key from the built-in
    ``hash()``, which is salted per process (PYTHONHASHSEED), so the
    "same" query produced a different key after every interpreter
    restart — defeating any cache keyed on the result. A hashlib digest
    is stable across runs and machines.

    Args:
        startDate: query start-date string (e.g. '2020-01-01').
        endDate: query end-date string.
        keyword: search keyword ('' when no keyword is set).
        pos: mapping of part-of-speech name -> bool; each flag is
            appended to the key as '1'/'0' in iteration order.

    Returns:
        Hex-digest string plus a POS bit-suffix identifying the query.
    """
    import hashlib  # local import keeps the file-level import block untouched

    # '\x1f' (unit separator) joins the fields unambiguously so that
    # ('ab', 'c') and ('a', 'bc') cannot collide.
    digest = hashlib.sha1(
        '\x1f'.join((str(startDate), str(endDate), str(keyword))).encode('utf-8')
    ).hexdigest()
    # Encode the POS toggles as bits so queries differing only in POS
    # filters still get distinct keys.
    bits = ''.join('1' if val else '0' for val in pos.values())
    return digest + bits
def contentProcess(content, text):
    """Cut one post into sentence chunks around the query keyword.

    Args:
        content: query dict; reads 'keyword' here and 'pos' via filterPOS.
        text: (aid, raw_content) tuple for one post.

    Returns:
        List of token lists (one per sentence chunk), or None when the
        post does not contain the keyword at all.
    """
    aid = text[0]
    raw = text[1]
    # Cheap pre-filter: skip the expensive POS cut when the keyword is
    # absent from the raw text.
    if content['keyword'] != '' and content['keyword'] not in raw:
        return None
    cutted = filterPOS(content, aid)
    sentenses = []
    if content['keyword'] != '':
        # Collect a +/-9 token window around every keyword occurrence,
        # then split the merged positions into runs of consecutive indices.
        hits = [i for i, tok in enumerate(cutted) if tok == content['keyword']]
        resultPos = []
        for i in hits:
            for j in range(i - 9, i + 10):
                if 0 <= j < len(cutted) and j not in resultPos:
                    resultPos.append(j)
        lastPos = -1
        result = []
        for i in sorted(resultPos):
            if i - lastPos != 1 and lastPos != -1:
                # Gap between windows: close the current chunk.
                sentenses.append(result)
                result = []
            result.append(cutted[i])
            lastPos = i
        # NOTE: when there are no hits this appends an empty chunk, as
        # the original did; callers filter those out via a length check.
        sentenses.append(result)
    else:
        # No keyword: chop the whole cut post into fixed-length chunks.
        result = []
        for tok in cutted:
            result.append(tok)
            if len(result) >= sentence_length:
                sentenses.append(result.copy())
                result = []
        if result != []:
            sentenses.append(result)
    # BUG FIX: removed the debug ``print(sentenses)`` — it dumped every
    # post's token chunks to stdout, interleaving garbage when run
    # inside the multiprocessing pool.
    return sentenses
def filterPOS(content, aid):
    """Cut post ``aid`` into words, keeping only enabled POS classes.

    Args:
        content: query dict with 'pos' (POS-name -> bool toggles) and
            'keyword'.
        aid: article id passed through to ``data.posseg``.

    Returns:
        List of word tokens, in post order, surviving the POS filter.
        Stop words are deliberately NOT removed on the server side.
    """
    if prod(list(content['pos'].values())) == False:
        # At least one POS class is switched off: inspect every token's tag.
        pos = content['pos']
        cuttedWithPOS = data.posseg(aid)
        cutted = []
        for tok in cuttedWithPOS:
            flag = tok['flag']
            word = tok['word']
            if flag[0] == 'n' or flag[0] == 'N':
                if pos['noun']:
                    cutted.append(word)
            elif flag[0] == 'v' or (flag[0] == 'V' and flag != 'Vi'):
                if pos['verb']:
                    cutted.append(word)
            elif flag == 'Vi':
                # 'Vi' (state verb) is mapped to the adjective toggle here.
                if pos['adj']:
                    cutted.append(word)
            elif flag == 'ADV':
                if pos['adv']:
                    cutted.append(word)
            elif flag == 'r':
                if pos['pron']:
                    cutted.append(word)
            elif flag == 'POST' or flag == 'T':
                if pos['aux']:
                    cutted.append(word)
            else:
                if pos['other']:
                    # Drop English fragments ('eng'), unknown tokens ('x')
                    # and numerals ('m').
                    if flag != 'eng' and flag != 'x' and flag != 'm':
                        cutted.append(word)
                elif word == content['keyword']:
                    # Keep the keyword itself even when its class is off.
                    cutted.append(word)
    else:
        # Every POS class is enabled: keep everything except English
        # fragments, unknown tokens and numerals.
        cuttedWithPOS = data.posseg(aid)
        cutted = [tok['word'] for tok in cuttedWithPOS
                  if tok['flag'] != 'eng' and tok['flag'] != 'x' and tok['flag'] != 'm']
    # Stop words are not removed on the server side (client decides what
    # to hide). The original read content['stopwords'] here but never
    # applied it, and also captured an unused start time — both dropped.
    return list(cutted)
def findKeywordFrequency(content):
    """Count keyword occurrences among posts inside a date range.

    Args:
        content: dict with 'startDate'/'endDate' ('%Y-%m-%d'),
            'globKeyword' (coarse pre-filter) and 'keyword' (the term
            actually counted).

    Returns:
        dict with 'wordCount' (total occurrences) and 'postCount'
        (number of posts containing the keyword at least once).
    """
    # Post dates are '%Y%m%d%H%M%S' strings, so range checks compare
    # lexicographically; '999999' is an out-of-range time suffix that
    # sorts after any real timestamp on the end day.
    startDate = datetime.strptime(
        content["startDate"], '%Y-%m-%d').strftime('%Y%m%d') + '000000'
    endDate = datetime.strptime(
        content['endDate'], '%Y-%m-%d').strftime('%Y%m%d') + '999999'
    result = {
        'wordCount': 0,
        'postCount': 0
    }
    for post in postContents:
        if post['date'] > endDate or post['date'] < startDate:
            continue
        if content['globKeyword'] not in post['content']:
            continue
        # BUG FIX: the original passed the raw keyword to re.findall,
        # which crashed or miscounted whenever the keyword contained
        # regex metacharacters ('(', '?', '+', ...). str.count gives the
        # intended literal, non-overlapping count.
        counts = post['content'].count(content['keyword'])
        if counts > 0:
            result['wordCount'] += counts
            result['postCount'] += 1
    return result
def findRange(wordList: list, text: str):
    """Locate the tightest (start, end) span of ``text`` that still
    contains all words of ``wordList`` in order.

    Starts from the first occurrence of the first word and the last
    occurrence of the last word, then greedily advances the start
    forward and the end backward while every inner word can still be
    found between them.

    Returns:
        (pTop, pBot) character indices into ``text``.
    """
    firstWord = wordList[0]
    lastWord = wordList[-1]
    innerWords = wordList[1:-1]
    pTop = text.find(firstWord)
    pBot = text.rfind(lastWord)

    # Phase 1: push the start as far right as possible.
    while True:
        candidate = text.find(firstWord, pTop + 1)
        if candidate == -1:
            break
        if not all(text.find(w, candidate, pBot) != -1 for w in innerWords):
            break
        pTop = candidate

    # Phase 2: pull the end as far left as possible.
    while True:
        candidate = text.rfind(lastWord, pTop, pBot)
        if candidate == -1:
            break
        if not all(text.find(w, pTop, candidate) != -1 for w in innerWords):
            break
        pBot = candidate

    return (pTop, pBot)
def findResult(content):
    """Run a keyword query and build the sentence-tree input payload.

    Args:
        content: query dict with 'startDate', 'endDate' ('%Y-%m-%d'),
            'keyword' ('' for a keyword-less query) and 'pos' toggles.

    Returns:
        dict with 'tsv' (id/text/count table as a TSV string), 'json'
        (per-sentence metadata) and 'stopWords' (default stop words).
    """
    timeStart = time()
    # NOTE(review): this cache key is computed but never used below —
    # presumably left over from a removed (or planned) result cache.
    key = calcKey(content['startDate'],
                  content["endDate"], content["keyword"], content["pos"])
    # '%Y%m%d%H%M%S' date strings compare lexicographically; '999999'
    # sorts after any real time suffix on the end day.
    startDate = datetime.strptime(
        content['startDate'], '%Y-%m-%d').strftime('%Y%m%d') + '000000'
    endDate = datetime.strptime(
        content['endDate'], '%Y-%m-%d').strftime('%Y%m%d') + '999999'
    result = [['id', 'text', 'count']]
    filtered = []
    titles = {
        'info': {
            'keyword': content['keyword'],
            'count': 0,
            'posts': 0
        }
    }
    for post in postContents:
        if post['date'] > endDate or post['date'] < startDate:
            continue
        if content['keyword'] != '':
            if content['keyword'] in post['content']:
                filtered.append(post)
                # Literal occurrence count; the original fed the raw
                # keyword to re.findall, which misbehaved on regex
                # metacharacters.
                titles['info']['count'] += post['content'].count(content['keyword'])
        else:
            filtered.append(post)
    titles['info']['posts'] = len(filtered)
    # Keep only the most-pushed posts: 30 for an overview, 100 for a
    # keyword query.
    limit = 30 if content['keyword'] == '' else 100
    filtered = sorted(filtered, key=lambda x: x['pushes'], reverse=True)[:limit]
    print('到第一步為止生成花費', int(time() - timeStart), '')
    pairs = [(post['aid'], post['content']) for post in filtered]
    if os.name == 'posix':
        # Cut posts in parallel; fork-based pools are POSIX-only here.
        with mp.Pool(mp.cpu_count()) as pool:
            sentensesList = pool.map_async(
                partial(contentProcess, content), pairs).get()
    else:
        # BUG FIX: the original assigned the lazy ``map`` object and
        # later indexed it (``sentensesList[index]``), raising TypeError
        # on every non-POSIX system. Materialize it as a list.
        sentensesList = list(map(partial(contentProcess, content), pairs))
    for index, post in enumerate(filtered):
        sentenses = sentensesList[index]
        # Skip posts that produced no (or only empty) sentence chunks;
        # also guards the None that contentProcess returns for misses.
        if sentenses is None or sum(len(s) for s in sentenses) == 0:
            continue
        for chunk in sentenses:
            cut = findRange(chunk, post['content'])
            seq = ' '.join(chunk)
            if seq not in titles:
                titles[seq] = {
                    'title': post['title'],
                    'url': post['url'],
                    'pushes': post['pushes'],
                    'author': post['author'],
                    'date': datetime.strptime(post['date'], '%Y%m%d%H%M%S').strftime("%a %b %d %H:%M:%S %Y"),
                    'part': post['content'][max(0, cut[0] - 20): min(len(post['content']), cut[1])].replace('\n', '')
                }
            result.append([len(result), seq, 1000 + post['pushes']])
    print('到第二步為止生成花費', int(time() - timeStart), '')
    fileString = io.StringIO()
    writer = csv.writer(fileString, delimiter='\t')
    writer.writerows(result)
    csvString = fileString.getvalue()
    jsonString = json.dumps(titles, ensure_ascii=False, indent=4)
    print('tsv', '生成花費', int(time() - timeStart), '')
    return {
        'tsv': csvString,
        'json': jsonString,
        'stopWords': defaultStopWords
    }
def loadPostContents():
    """(Re)load the pickled post corpus into the module-global cache.

    Side effect: rebinds the module-level ``postContents``.
    """
    global postContents
    # NOTE(review): pickle.load assumes the dump file is trusted local
    # data. The 'with' block closes the file; the original's explicit
    # f.close() inside it was redundant.
    with open('/home/vis/pttDatabase/PTTData/Gossiping/content/content.pck', 'rb') as f:
        postContents = pickle.load(f)
def getDefault(startDate, endDate):
    """Build the keyword-less default overview for a date range.

    Reloads the post corpus whenever the requested range differs from
    the previously served one, then produces the same payload shape as
    ``findResult`` for the 30 most-pushed posts in range.

    Args:
        startDate: range start as a '%Y-%m-%d' string.
        endDate: range end as a '%Y-%m-%d' string.

    Returns:
        dict with 'tsv' (id/text/count TSV string), 'json'
        (per-sentence metadata) and 'stopWords'.
    """
    global defaultDate
    if startDate != defaultDate['startDate'] or endDate != defaultDate['endDate']:
        # Range changed since the last call: refresh the post cache and
        # remember the new range.
        loadPostContents()
        defaultDate['startDate'] = startDate
        defaultDate['endDate'] = endDate
        print('更新預設資料')
    timeStart = time()
    # '%Y%m%d%H%M%S' date strings compare lexicographically; '999999'
    # sorts after any real time suffix on the end day.
    startDate = datetime.strptime(
        startDate, '%Y-%m-%d').strftime('%Y%m%d') + '000000'
    endDate = datetime.strptime(
        endDate, '%Y-%m-%d').strftime('%Y%m%d') + '999999'
    result = [['id', 'text', 'count']]
    titles = {
        'info': {
            'keyword': '',
            'counts': 0,
            'posts': 0
        }
    }
    filtered = [post for post in postContents
                if startDate <= post['date'] <= endDate]
    titles['info']['posts'] = len(filtered)
    # The overview keeps only the 30 most-pushed posts.
    filtered = sorted(filtered, key=lambda x: x['pushes'], reverse=True)[:30]
    # 'other': True makes prod(pos.values()) truthy, so filterPOS takes
    # its simple keep-everything cut path.
    content = {
        'keyword': '',
        'pos': {
            'other': True
        }
    }
    # The original also did an unused ``post = data[i]`` lookup and a
    # dead counter increment per post — both dropped.
    for post in filtered:
        sentenses = contentProcess(content, (post['aid'], post['content']))
        for chunk in sentenses:
            cut = findRange(chunk, post['content'])
            seq = ' '.join(chunk)
            if seq not in titles:
                titles[seq] = {
                    'title': post['title'],
                    'url': post['url'],
                    'pushes': post['pushes'],
                    'author': post['author'],
                    'date': datetime.strptime(post['date'], '%Y%m%d%H%M%S').strftime("%a %b %d %H:%M:%S %Y"),
                    'part': post['content'][max(0, cut[0] - 20): min(len(post['content']), cut[1])].replace('\n', '')
                }
            # Flat weight of 3000 unless push-based weighting is enabled.
            result.append([len(result), seq,
                           (1000 + post['pushes']) if use_push_count else 3000])
    fileString = io.StringIO()
    writer = csv.writer(fileString, delimiter='\t')
    writer.writerows(result)
    csvString = fileString.getvalue()
    jsonString = json.dumps(titles, ensure_ascii=False, indent=4)
    print('tsv', '生成花費', int(time() - timeStart), '')
    return {
        'tsv': csvString,
        'json': jsonString,
        'stopWords': defaultStopWords
    }