You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
ptt-sententree/dataHandlerPTTPush.py

90 lines
2.7 KiB

from PTTData import PTTData
from pprint import pprint
from datetime import datetime
from progressbar import ProgressBar
import json
import csv
import io
# PTTData handle constructed for 'Gossiping'; the functions below query its
# pushCllc and pushPossegCllc collections.
data = PTTData('Gossiping')

# Cache used by getMostFrequentAuthor(): the day of the last refresh and the
# author computed then. Both stay None until the first refresh.
# NOTE(review): lastUpdate is annotated as datetime but is assigned a
# date (datetime.today().date()).
lastUpdate: datetime = None
mostFrequentAuthor: str = None

# Stop words read from the resource file, one per line, with surrounding
# whitespace (including the trailing newline) stripped.
with open('resource/stopWords.txt', 'r', encoding='UTF-8') as file:
    defaultStopWords = [line.strip() for line in file]
def getMostFrequentAuthor(title: str = None):
    """Yield the author with the most pushes, backed by a module-level cache.

    This is a generator. When the cache was refreshed today, or already holds
    an author, the cached value is yielded first. The per-author counts are
    then recomputed from ``data.pushCllc``, the cache is updated, and the
    newly computed author is yielded.

    Args:
        title: Currently unused.

    Yields:
        The cached author (only when one is available), then the freshly
        computed author.

    Raises:
        ValueError: If the aggregation returns no rows (``max`` of an empty
            sequence). The cache is left untouched in that case.
    """
    global mostFrequentAuthor, lastUpdate

    # Serve whatever is cached before doing the expensive aggregation.
    if lastUpdate == datetime.today().date() or mostFrequentAuthor is not None:
        yield mostFrequentAuthor

    # NOTE(review): the counts are recomputed even when the cache was already
    # refreshed today; consumers that only take the first item never reach
    # this point. Confirm whether an early return was intended.
    authorCounts = data.pushCllc.aggregate(pipeline=[
        {'$group': {'_id': '$author', 'count': {'$sum': 1}}},
    ])
    topAuthor = max(authorCounts, key=lambda row: row['count'])['_id']

    # Update both cache fields only after the computation succeeded, so a
    # failed refresh cannot mark a stale (or missing) author as fresh.
    lastUpdate = datetime.today().date()
    mostFrequentAuthor = topAuthor
    yield mostFrequentAuthor
def possegPushes(ids: list, stopwords: list, keyword: str):
    """Build space-joined token strings for the documents matching *ids*.

    Queries ``data.pushPossegCllc`` for documents whose ``ID`` is in *ids*.
    For each document, every ``content`` entry whose element 0 is not one of
    'eng', 'x' or 'm' contributes its element 1 as a token. A document is kept
    when *keyword* is the empty string or equals one of its tokens.

    Args:
        ids: Values matched against the ``ID`` field with ``$in``.
        stopwords: Accepted but not used by this function.
        keyword: Token that must be present; '' disables the filter.

    Returns:
        A list of dicts with keys ``'posString'`` (tokens joined by a single
        space) and ``'ID'`` (copied from the document).
    """
    excludedTags = ['eng', 'x', 'm']
    matched = []
    for doc in data.pushPossegCllc.find({'ID': {'$in': ids}}):
        tokens = [entry[1] for entry in doc['content']
                  if entry[0] not in excludedTags]
        # Skip documents that do not contain the requested keyword.
        if keyword != '' and keyword not in tokens:
            continue
        matched.append({
            'posString': ' '.join(tokens),
            'ID': doc['ID']
        })
    return matched
def findAuthorPush(author: list = None, aid: list = None, keyword: str = '', stopwords: list = None):
    """Collect pushes by author and/or post id and export them as TSV and JSON.

    Builds a query for ``data.pushCllc`` from *author* and *aid*, fetches the
    matching pushes, and passes their ``_id`` values to ``possegPushes``.

    Args:
        author: Author names matched with ``$in``; ``None`` or ``['']`` means
            no author filter.
        aid: Post ids matched against ``postAid`` with ``$in``; ``None`` or
            ``['']`` means no post filter.
        keyword: Forwarded to ``possegPushes``.
        stopwords: Forwarded to ``possegPushes``; defaults to an empty list.

    Returns:
        A ``(csvString, jsonString)`` tuple. ``csvString`` is tab-delimited
        with header ``id, text, count`` and one row per ``possegPushes``
        result (running index, token string, the constant 3000).
        ``jsonString`` is a JSON array of the fetched pushes (``title``,
        ``author``, ``pushes``, plus ``part`` when a token string exists).
    """
    # Avoid a shared mutable default; a fresh list is created per call.
    if stopwords is None:
        stopwords = []

    terms = {}
    if author is not None and author != ['']:
        terms['author'] = {
            '$in': author
        }
    if aid is not None and aid != ['']:
        terms['postAid'] = {
            '$in': aid
        }
    print(terms)

    pushId = []
    pushContent = {}
    for p in data.pushCllc.find(terms):
        pushId.append(p['_id'])
        pushContent[str(p['_id'])] = {
            'title': p['title'],
            'author': p['author'],
            'pushes': p['pushes']
        }

    possegResult = [['id', 'text', 'count']]
    for index, n in enumerate(possegPushes(pushId, stopwords, keyword)):
        key = str(n['ID'])
        # Attach the token string to the push it belongs to, when known.
        if key in pushContent:
            pushContent[key]['part'] = str(n['posString'])
        possegResult.append([index, n['posString'], 3000])

    jsonString = json.dumps(
        list(pushContent.values()), indent=4, ensure_ascii=False)

    with io.StringIO() as f:
        csv.writer(f, delimiter='\t').writerows(possegResult)
        csvString = f.getvalue()
    return (csvString, jsonString)
if __name__ == "__main__":
    # Manual run: pretty-print the (csv, json) result for one hard-coded author.
    pprint(findAuthorPush(author=['gwenwoo']))