ptt-sententree/dataHandlerPTTPush.py

from PTTData import PTTData
from pprint import pprint
from datetime import datetime
from progressbar import ProgressBar
import json
import csv
import io

# Shared data-access handle for the 'Gossiping' board; the functions below
# read its pushCllc / pushPossegCllc collections.
data = PTTData('Gossiping')

# Cache state maintained by getMostFrequentAuthor().
# NOTE(review): the value stored at runtime is `datetime.today().date()`,
# i.e. a date rather than a datetime, and both names start out as None.
lastUpdate: datetime = None
mostFrequentAuthor: str = None

# Stop words, one per line of the resource file, with surrounding whitespace
# (including the trailing newline) stripped. Loaded once at import time.
defaultStopWords = []
with open('resource/stopWords.txt', 'r', encoding='UTF-8') as file:
    defaultStopWords.extend(line.strip() for line in file)


def getMostFrequentAuthor(title: str = None):
    """Yield the author with the most documents in ``data.pushCllc``.

    This is a generator backed by the module-level cache
    (``mostFrequentAuthor`` / ``lastUpdate``):

    * Cache refreshed today: yield the cached author once and stop, without
      querying the collection.
    * Cache present but stale: yield the stale author first, then run the
      aggregation and yield the refreshed author.
    * No cache yet: run the aggregation and yield the result.

    Args:
        title: Unused by this function; kept so existing call sites that
            pass it keep working.

    Yields:
        The ``_id`` of the ``$group`` bucket (grouped on ``$author``) with
        the highest ``count``.

    Raises:
        ValueError: If the aggregation yields no groups (``max`` of an
            empty iterable). The cache is left untouched in that case.
    """
    global mostFrequentAuthor, lastUpdate
    today = datetime.today().date()

    if lastUpdate == today and mostFrequentAuthor is not None:
        # Fresh cache: serve it and finish. Without this early return a
        # fully consumed generator would recompute on every call.
        yield mostFrequentAuthor
        return

    if mostFrequentAuthor is not None:
        # Stale cache: hand out the old answer first so a consumer that only
        # takes the first item gets a result without waiting for the query.
        yield mostFrequentAuthor

    authorList = data.pushCllc.aggregate(pipeline=[{'$group': {
        '_id': '$author',
        'count': {
            '$sum': 1
        }
    }}])
    mostFrequentAuthor = max(authorList, key=lambda x: x['count'])['_id']
    # Stamp the cache only after the lookup succeeded, so a failed refresh
    # is retried on the next call instead of being treated as fresh.
    lastUpdate = today
    yield mostFrequentAuthor


def possegPushes(ids: list, stopwords: list, keyword: str):
    """Build a space-joined word string for each matching posseg document.

    Looks up documents in ``data.pushPossegCllc`` whose ``ID`` is in ``ids``.
    For each one, it keeps the second element of every ``content`` entry
    whose first element is not ``'eng'``, ``'x'`` or ``'m'``
    (presumably part-of-speech tags; confirm against the writer of this
    collection).

    Args:
        ids: Values matched against the documents' ``ID`` field.
        stopwords: Accepted for interface compatibility; not used here.
        keyword: When non-empty, only documents whose kept words contain
            this exact word are returned. ``''`` disables the filter.

    Returns:
        A list of ``{'posString': <words joined by spaces>, 'ID': <doc ID>}``
        dicts, in cursor order.
    """
    skippedTags = ('eng', 'x', 'm')
    matched = []
    for doc in data.pushPossegCllc.find({'ID': {'$in': ids}}):
        tokens = []
        for pair in doc['content']:
            if pair[0] in skippedTags:
                continue
            tokens.append(pair[1])
        # Drop the document when a keyword was requested but is absent.
        if keyword != '' and keyword not in tokens:
            continue
        matched.append({
            'posString': ' '.join(tokens),
            'ID': doc['ID']
        })
    return matched


def findAuthorPush(author: list = None, aid: list = None, keyword: str = '', stopwords: list = None):
    """Collect pushes by author and/or post id and export them as CSV + JSON.

    Args:
        author: Author names to match (``$in`` on ``author``). ``None`` or
            ``['']`` means "do not filter by author".
        aid: Post ids to match (``$in`` on ``postAid``). ``None`` or ``['']``
            means "do not filter by post id".
        keyword: Forwarded to ``possegPushes``; ``''`` disables the keyword
            filter there.
        stopwords: Forwarded to ``possegPushes``. Defaults to a fresh empty
            list per call.

    Returns:
        A ``(csvString, jsonString)`` tuple.

        * ``csvString``: tab-delimited text with header ``id, text, count``
          and one row per posseg result whose ``ID`` matches a fetched push.
          ``id`` is the result's position in the ``possegPushes`` output
          (so it may have gaps), ``text`` is its ``posString``, and ``count``
          is the constant ``3000`` (its meaning is not evident from this
          file).
        * ``jsonString``: JSON array (indent 4, non-ASCII kept as-is) of the
          fetched pushes' ``title`` / ``author`` / ``pushes``, plus ``part``
          for those that had a matching posseg result.
    """
    # Avoid a shared mutable default; downstream still receives a list.
    if stopwords is None:
        stopwords = []

    terms = {}
    if author is not None and author != ['']:
        terms['author'] = {
            '$in': author
        }
    if aid is not None and aid != ['']:
        terms['postAid'] = {
            '$in': aid
        }
    # Existing debug output of the query filter; kept for compatibility.
    print(terms)

    pushId = []
    pushContent = {}
    for p in data.pushCllc.find(terms):
        pushId.append(p['_id'])
        # Keyed by the stringified _id so posseg results can be joined back.
        pushContent[str(p['_id'])] = {
            'title': p['title'],
            'author': p['author'],
            'pushes': p['pushes']
        }

    possegList = possegPushes(pushId, stopwords, keyword)
    possegResult = [['id', 'text', 'count']]
    for index, n in enumerate(possegList):
        key = str(n['ID'])
        if key in pushContent:
            pushContent[key]['part'] = str(n['posString'])
            possegResult.append([index, n['posString'], 3000])

    jsonString = json.dumps(
        list(pushContent.values()), indent=4, ensure_ascii=False)
    with io.StringIO() as f:
        writer = csv.writer(f, delimiter='\t')
        writer.writerows(possegResult)
        csvString = f.getvalue()
    return (csvString, jsonString)


if __name__ == "__main__":
    # Manual smoke run: print the (csv, json) export for one sample author.
    pprint(findAuthorPush(author=['gwenwoo']))
Add pttPush 4 years ago			`from PTTData import PTTData`
			`from pprint import pprint`
			`from datetime import datetime`
			`from progressbar import ProgressBar`
			`import json`
			`import csv`
			`import io`

			`data = PTTData('Gossiping')`
			`lastUpdate: datetime = None`
			`mostFrequentAuthor: str = None`

			`defaultStopWords = []`
			`with open('resource/stopWords.txt', 'r', encoding='UTF-8') as file:`
			`for word in file.readlines():`
			`word = word.strip()`
			`defaultStopWords.append(word)`


			`def getMostFrequentAuthor(title: str = None):`
			`global mostFrequentAuthor, lastUpdate`
			`if (lastUpdate == datetime.today().date()):`
			`yield mostFrequentAuthor`
			`elif (mostFrequentAuthor != None):`
			`yield mostFrequentAuthor`
			`authorList = data.pushCllc.aggregate(pipeline=[{'$group': {`
			`'_id': '$author',`
			`'count': {`
			`'$sum': 1`
			`}`
			`}}])`
			`lastUpdate = datetime.today().date()`
			`mostFrequentAuthor = max(authorList, key=lambda x: x['count'])['_id']`
			`yield mostFrequentAuthor`
			`return`


			`def possegPushes(ids: list, stopwords: list, keyword: str):`
			`possegs = data.pushPossegCllc.find({'ID': {'$in': ids}})`
			`result = []`
			`for index, p in enumerate(possegs):`
			`words = [i[1] for i in p['content'] if i[0] not in [`
修改停用詞的處理方法 4 years ago			`'eng', 'x', 'm']]`
Add pttPush 4 years ago			`if(keyword == '' or keyword in words):`
			`result.append({`
			`'posString': ' '.join(words),`
			`'ID': p['ID']`
			`})`
			`return result`


			`def findAuthorPush(author: list = None, aid: list = None, keyword: str = '', stopwords: list = []):`
			`terms = {}`
			`if (author != [''] and author != None):`
			`terms['author'] = {`
			`'$in': author`
			`}`
			`if (aid != [''] and aid != None):`
			`terms['postAid'] = {`
			`'$in': aid`
			`}`
			`print(terms)`
			`pushes = data.pushCllc.find(terms)`
			`pushId = []`
			`pushContent = {}`
			`for p in pushes:`
			`pushId.append(p['_id'])`
			`pushContent[str(p['_id'])] = {`
			`'title': p['title'],`
			`'author': p['author'],`
			`'pushes': p['pushes']`
			`}`
			`possegList = possegPushes(pushId, stopwords, keyword)`
			`possegResult = [['id', 'text', 'count']]`
			`for index, n in enumerate(possegList):`
			`if(str(n['ID']) in pushContent.keys()):`
			`pushContent[str(n['ID'])]['part'] = str(n['posString'])`
			`possegResult.append([index, n['posString'], 3000])`
			`jsonString = json.dumps(`
			`[i for i in pushContent.values()], indent=4, ensure_ascii=False)`
			`with io.StringIO() as f:`
			`writer = csv.writer(f, delimiter='\t')`
			`writer.writerows(possegResult)`
			`csvString = f.getvalue()`
			`return (csvString, jsonString)`


			`if __name__ == "__main__":`
			`pprint(findAuthorPush(['gwenwoo']))`