You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
90 lines
2.7 KiB
90 lines
2.7 KiB
from PTTData import PTTData
|
|
from pprint import pprint
|
|
from datetime import datetime
|
|
from progressbar import ProgressBar
|
|
import json
|
|
import csv
|
|
import io
|
|
|
|
# PTTData instance constructed for 'Gossiping'; the query functions in this
# module read its pushCllc / pushPossegCllc collections.
data = PTTData('Gossiping')

# Module-level cache used by getMostFrequentAuthor(). Both start as None.
# NOTE: lastUpdate is annotated as datetime, but getMostFrequentAuthor()
# assigns it datetime.today().date(), i.e. a date object.
lastUpdate: datetime = None
mostFrequentAuthor: str = None

# Stop-word list: one entry per line of the resource file, with surrounding
# whitespace stripped (a blank line therefore becomes an empty string).
with open('resource/stopWords.txt', 'r', encoding='UTF-8') as file:
    defaultStopWords = [line.strip() for line in file]
|
|
|
|
|
|
def getMostFrequentAuthor(title: str = None):
    """Generator that yields the author with the most documents in pushCllc.

    If a cached author exists (computed today or on an earlier day), it is
    yielded first. When the generator is advanced again, a ``$group``
    aggregation counts documents per ``author``, the module-level cache
    (``lastUpdate`` / ``mostFrequentAuthor``) is refreshed, and the newly
    computed author is yielded.

    Args:
        title: Accepted but not used by the current implementation.

    Yields:
        The cached author when one is available, then the freshly computed
        author.

    Raises:
        ValueError: Raised by ``max`` when the aggregation yields no groups.
    """
    global mostFrequentAuthor, lastUpdate

    # A value cached today and a stale-but-present value are served the same
    # way: hand out what we have before doing any database work.
    # NOTE(review): even when the cache is from today, advancing the
    # generator again re-runs the aggregation below -- confirm whether an
    # early return was intended for that case.
    cachedToday = lastUpdate == datetime.today().date()
    if cachedToday or mostFrequentAuthor is not None:
        yield mostFrequentAuthor

    # One group per author, each carrying the number of documents it covers.
    countPerAuthor = [{'$group': {'_id': '$author', 'count': {'$sum': 1}}}]
    groups = data.pushCllc.aggregate(pipeline=countPerAuthor)

    # The date is stamped before the maximum is taken, so it is updated even
    # if max() raises on an empty result.
    lastUpdate = datetime.today().date()
    topGroup = max(groups, key=lambda group: group['count'])
    mostFrequentAuthor = topGroup['_id']
    yield mostFrequentAuthor
|
|
|
|
|
|
def possegPushes(ids: list, stopwords: list, keyword: str):
    """Load segmented records for the given IDs and flatten each to a string.

    Queries ``data.pushPossegCllc`` for documents whose ``ID`` is in *ids*.
    For every document, each ``content`` entry whose first element is not
    'eng', 'x' or 'm' contributes its second element to that document's
    word list.

    Args:
        ids: Values matched against the ``ID`` field via ``$in``.
        stopwords: Accepted but not used by the current implementation.
        keyword: When non-empty, only documents whose word list contains
            this exact string are kept; ``''`` keeps every document.

    Returns:
        A list of ``{'posString': <space-joined words>, 'ID': <document ID>}``
        dicts, in the order the cursor returned the documents.
    """
    # Presumably part-of-speech tags to drop (entries look like
    # [tag, word] pairs) -- confirm against the collection schema.
    skipped = ['eng', 'x', 'm']

    matched = []
    for doc in data.pushPossegCllc.find({'ID': {'$in': ids}}):
        tokens = [entry[1] for entry in doc['content'] if entry[0] not in skipped]
        # A non-empty keyword acts as a filter on the whole document.
        if keyword != '' and keyword not in tokens:
            continue
        matched.append({
            'posString': ' '.join(tokens),
            'ID': doc['ID'],
        })
    return matched
|
|
|
|
|
|
def findAuthorPush(author: list = None, aid: list = None, keyword: str = '', stopwords: list = None):
    """Collect pushes by author and/or post id and export them as CSV and JSON.

    Builds a query on ``data.pushCllc`` from the supplied filters, fetches the
    matching documents, looks up their segmented text through
    ``possegPushes`` and renders two string outputs.

    Args:
        author: Author names matched with ``$in``. ``None`` or ``['']``
            disables the author filter.
        aid: Post ids matched against ``postAid`` with ``$in``. ``None`` or
            ``['']`` disables the filter.
        keyword: Forwarded to ``possegPushes``; ``''`` means no keyword
            filtering.
        stopwords: Forwarded to ``possegPushes``. ``None`` (the default) is
            treated as an empty list.

    Returns:
        A ``(csvString, jsonString)`` tuple.

        * ``csvString``: tab-delimited text with the header row
          ``id, text, count`` followed by one row per segmented record:
          its position in the ``possegPushes`` result, its ``posString``
          and the constant ``3000``.
        * ``jsonString``: a JSON array (indent 4, non-ASCII kept as-is) with
          one object per fetched push holding ``title``, ``author`` and
          ``pushes``, plus ``part`` when a segmented record with the same
          id was found.

    Raises:
        KeyError: If a fetched document lacks ``title``, ``author`` or
            ``pushes``.
    """
    # Use a None sentinel instead of a shared mutable default list.
    if stopwords is None:
        stopwords = []

    # With neither filter active the query stays {} (no restriction).
    terms = {}
    if author is not None and author != ['']:
        terms['author'] = {'$in': author}
    if aid is not None and aid != ['']:
        terms['postAid'] = {'$in': aid}
    print(terms)

    # Keep the raw ids for the follow-up query and index the push payloads
    # by the string form of their id for the merge step below.
    pushId = []
    pushContent = {}
    for p in data.pushCllc.find(terms):
        pushId.append(p['_id'])
        pushContent[str(p['_id'])] = {
            'title': p['title'],
            'author': p['author'],
            'pushes': p['pushes'],
        }

    possegList = possegPushes(pushId, stopwords, keyword)

    possegResult = [['id', 'text', 'count']]
    for index, n in enumerate(possegList):
        key = str(n['ID'])
        # possegPushes only queries IDs taken from pushId, so this is
        # expected to hold for every record -- TODO confirm ID/_id types match.
        if key in pushContent:
            pushContent[key]['part'] = str(n['posString'])
        # 3000 is a fixed value for the 'count' column; its meaning is not
        # visible from this function.
        possegResult.append([index, n['posString'], 3000])

    jsonString = json.dumps(
        list(pushContent.values()), indent=4, ensure_ascii=False)

    # Render the CSV in memory; the value must be read before the buffer closes.
    with io.StringIO() as f:
        csv.writer(f, delimiter='\t').writerows(possegResult)
        csvString = f.getvalue()

    return (csvString, jsonString)
|
|
|
|
|
|
if __name__ == "__main__":
    # Manual run: pretty-print the (csv, json) result for one sample author.
    pprint(findAuthorPush(author=['gwenwoo']))
|