# ptt-sententree/dataHandlerPTTPush.py

from PTTData import PTTData
from pprint import pprint
from datetime import datetime
from progressbar import ProgressBar
import json
import csv
import io

# Shared data-access handle used by every query in this module; it is
# constructed with the argument 'Gossiping'.
data = PTTData('Gossiping')

# Cache used by getMostFrequentAuthor(): the cached author and the calendar
# date (datetime.today().date()) on which it was computed. Both start unset.
lastUpdate: datetime = None
mostFrequentAuthor: str = None

# Stop words read from the resource file, one entry per line with surrounding
# whitespace stripped. Blank lines are kept as empty strings.
with open('resource/stopWords.txt', 'r', encoding='UTF-8') as file:
    defaultStopWords = [line.strip() for line in file]


def getMostFrequentAuthor(title: str = None):
    """Yield the author that occurs most often in the push collection.

    This is a generator. What it yields depends on the module-level cache
    (``mostFrequentAuthor`` / ``lastUpdate``):

    * cache computed today: yields the cached author and stops;
    * cache computed on an earlier day: yields the cached (stale) author
      first, then recomputes and yields the fresh one;
    * no cache yet: computes the author and yields it once.

    Args:
        title: Unused by this function; kept so existing callers keep working.

    Yields:
        The ``_id`` of the ``$group`` bucket with the highest ``count``,
        i.e. the ``author`` value shared by the most documents.

    Raises:
        ValueError: If the aggregation returns no groups (``max`` of an
            empty sequence). The cache is left untouched in that case.
    """
    global mostFrequentAuthor, lastUpdate
    if lastUpdate == datetime.today().date():
        yield mostFrequentAuthor
        # The cache is current: stop here instead of falling through and
        # re-running the aggregation when the consumer keeps iterating.
        return
    if mostFrequentAuthor is not None:
        # Hand out the stale value right away; the fresh one follows below.
        yield mostFrequentAuthor
    authorCounts = data.pushCllc.aggregate(pipeline=[{'$group': {
        '_id': '$author',
        'count': {
            '$sum': 1
        }
    }}])
    # Compute the result before touching the cache so that a failure here
    # does not stamp today's date onto a missing or stale value.
    topAuthor = max(authorCounts, key=lambda x: x['count'])['_id']
    mostFrequentAuthor = topAuthor
    lastUpdate = datetime.today().date()
    yield mostFrequentAuthor


def possegPushes(ids: list, stopwords: list, keyword: str):
    """Build space-joined word strings for the segmented pushes in ``ids``.

    Looks up documents in ``data.pushPossegCllc`` whose ``ID`` is in ``ids``.
    Each document's ``content`` is iterated as a sequence of entries where
    index 0 is compared against the skipped tags and index 1 is the word
    (presumably part-of-speech tag and token - confirm against the writer
    of this collection).

    Args:
        ids: Values matched against the ``ID`` field with ``$in``.
        stopwords: Words removed from the emitted ``posString``. ``None`` or
            an empty list disables the filtering.
        keyword: When non-empty, only documents whose word list contains
            this exact word are returned. The check runs before stop-word
            removal, so a keyword that is also a stop word still matches.

    Returns:
        A list of ``{'posString': str, 'ID': ...}`` dicts, one per matching
        document, in the order the query returned them.
    """
    # Entries carrying one of these tags are dropped from the word list.
    skippedTags = ('eng', 'x', 'm')
    # Built once so each membership test below is O(1).
    blockedWords = frozenset(stopwords or ())
    possegs = data.pushPossegCllc.find({'ID': {'$in': ids}})
    result = []
    for p in possegs:
        words = [entry[1] for entry in p['content']
                 if entry[0] not in skippedTags]
        if keyword != '' and keyword not in words:
            continue
        keptWords = [w for w in words if w not in blockedWords]
        result.append({
            'posString': ' '.join(keptWords),
            'ID': p['ID']
        })
    return result


def findAuthorPush(author: list = None, aid: list = None, keyword: str = '', stopwords: list = None):
    """Collect pushes by author and/or post id and export them as TSV and JSON.

    Args:
        author: Author names matched with ``$in`` on the ``author`` field.
            ``None`` or ``['']`` means no author filter.
        aid: Values matched with ``$in`` on the ``postAid`` field.
            ``None`` or ``['']`` means no filter on that field.
        keyword: Passed to ``possegPushes``; an empty string disables the
            keyword filter.
        stopwords: Passed to ``possegPushes``. Defaults to an empty list.

    Returns:
        A ``(csvString, jsonString)`` tuple:

        * ``csvString`` is tab-delimited with the header row
          ``id, text, count`` followed by one row per segmented push:
          its position in the segmented list, its word string, and the
          constant ``3000``.
        * ``jsonString`` is a JSON array with one object per push found,
          holding ``title``, ``author`` and ``pushes``, plus ``part`` (the
          word string) for pushes that came back from ``possegPushes``.
    """
    # A fresh list per call avoids sharing one default list object
    # between calls.
    if stopwords is None:
        stopwords = []

    terms = {}
    if author is not None and author != ['']:
        terms['author'] = {
            '$in': author
        }
    if aid is not None and aid != ['']:
        terms['postAid'] = {
            '$in': aid
        }
    print(terms)

    pushId = []
    pushContent = {}
    for p in data.pushCllc.find(terms):
        pushId.append(p['_id'])
        # Keyed by the stringified _id so segmented results can be joined
        # back by str(ID) below.
        pushContent[str(p['_id'])] = {
            'title': p['title'],
            'author': p['author'],
            'pushes': p['pushes']
        }

    possegList = possegPushes(pushId, stopwords, keyword)
    possegResult = [['id', 'text', 'count']]
    for index, n in enumerate(possegList):
        key = str(n['ID'])
        if key in pushContent:
            pushContent[key]['part'] = str(n['posString'])
            possegResult.append([index, n['posString'], 3000])

    jsonString = json.dumps(
        list(pushContent.values()), indent=4, ensure_ascii=False)
    with io.StringIO() as f:
        writer = csv.writer(f, delimiter='\t')
        writer.writerows(possegResult)
        csvString = f.getvalue()
    return (csvString, jsonString)


if __name__ == "__main__":
    # Manual smoke run: fetch the pushes of one author and pretty-print the
    # resulting (csvString, jsonString) tuple.
    sampleResult = findAuthorPush(['gwenwoo'])
    pprint(sampleResult)