"""Query push records through ``PTTData`` and export them as TSV / JSON text.

Importing this module has side effects: it constructs a ``PTTData`` handle
and reads ``resource/stopWords.txt`` relative to the working directory.
"""
from PTTData import PTTData
from pprint import pprint
from datetime import date, datetime
from typing import Optional
from progressbar import ProgressBar
import json
import csv
import io

# Shared data handle used by every function below.
# NOTE(review): 'Gossiping' is presumably the PTT board name -- confirm
# against PTTData.
data = PTTData('Gossiping')

# Cache for getMostFrequentAuthor(): the day the cached value was computed
# (a ``date``, not a ``datetime``) and the cached author id.
lastUpdate: Optional[date] = None
mostFrequentAuthor: Optional[str] = None

# One stop word per line; surrounding whitespace is stripped.
with open('resource/stopWords.txt', 'r', encoding='UTF-8') as file:
    defaultStopWords = [line.strip() for line in file]

# Entries whose first element is one of these values are dropped by
# possegPushes().
# NOTE(review): these look like part-of-speech tags -- confirm.
_EXCLUDED_TAGS = ('eng', 'x', 'm')


def getMostFrequentAuthor(title: str = None):
    """Yield the author with the most push records.

    This is a generator.  If a previously computed value exists it is
    yielded first; the generator then re-runs the aggregation over
    ``data.pushCllc``, updates the module-level cache and yields the fresh
    value.

    Args:
        title: Currently unused.

    Yields:
        The ``_id`` of the aggregation group with the highest ``count``.

    Raises:
        ValueError: If the aggregation returns no groups (``max`` of an
            empty sequence).
    """
    global mostFrequentAuthor, lastUpdate

    # Serve the cached value first so a consumer can stop after one item.
    # NOTE(review): even when the cache was refreshed today, a consumer that
    # keeps iterating triggers the full aggregation below.  An early return
    # was probably intended for the "refreshed today" case, but adding one
    # would change how many values are yielded, so behaviour is kept.
    if lastUpdate == datetime.today().date() or mostFrequentAuthor is not None:
        yield mostFrequentAuthor

    authorList = data.pushCllc.aggregate(pipeline=[
        {'$group': {'_id': '$author', 'count': {'$sum': 1}}},
    ])
    lastUpdate = datetime.today().date()
    mostFrequentAuthor = max(authorList, key=lambda group: group['count'])['_id']
    yield mostFrequentAuthor


def possegPushes(ids: list, stopwords: list, keyword: str):
    """Build a space-joined word string for each segmented push.

    Fetches documents from ``data.pushPossegCllc`` whose ``ID`` is in *ids*.
    For each document, the second element of every ``content`` entry is kept
    unless the entry's first element is in ``_EXCLUDED_TAGS``.

    Args:
        ids: Values matched against the ``ID`` field with ``$in``.
        stopwords: Accepted for interface compatibility; not applied by the
            current implementation.
        keyword: When non-empty, only documents whose kept words contain
            this exact word are returned.

    Returns:
        A list of ``{'posString': str, 'ID': ...}`` dicts.
    """
    result = []
    for doc in data.pushPossegCllc.find({'ID': {'$in': ids}}):
        words = [entry[1] for entry in doc['content']
                 if entry[0] not in _EXCLUDED_TAGS]
        if keyword == '' or keyword in words:
            result.append({
                'posString': ' '.join(words),
                'ID': doc['ID'],
            })
    return result


def findAuthorPush(author: list = None, aid: list = None, keyword: str = '',
                   stopwords: list = None):
    """Collect pushes by author and/or post id and serialise them.

    Args:
        author: Author ids to match; ``None`` or ``['']`` disables the
            author filter.
        aid: Post ids matched against ``postAid``; ``None`` or ``['']``
            disables the filter.
        keyword: Forwarded to ``possegPushes``.
        stopwords: Forwarded to ``possegPushes``; defaults to an empty list.

    Returns:
        A ``(csvString, jsonString)`` tuple.  ``csvString`` is tab-separated
        with header ``id``/``text``/``count`` and one row per segmented
        push.  ``jsonString`` is a JSON array of the matched pushes
        (``title``, ``author``, ``pushes`` and, where a segmented form was
        found, ``part``).
    """
    # A fresh list per call instead of a shared mutable default.
    if stopwords is None:
        stopwords = []

    terms = {}
    if author is not None and author != ['']:
        terms['author'] = {'$in': author}
    if aid is not None and aid != ['']:
        terms['postAid'] = {'$in': aid}
    print(terms)

    pushId = []
    pushContent = {}
    for push in data.pushCllc.find(terms):
        pushId.append(push['_id'])
        pushContent[str(push['_id'])] = {
            'title': push['title'],
            'author': push['author'],
            'pushes': push['pushes'],
        }

    possegList = possegPushes(pushId, stopwords, keyword)

    possegResult = [['id', 'text', 'count']]
    for index, row in enumerate(possegList):
        entry = pushContent.get(str(row['ID']))
        if entry is not None:
            entry['part'] = str(row['posString'])
        # The 'count' column is a fixed value for every row.
        possegResult.append([index, row['posString'], 3000])

    jsonString = json.dumps(
        list(pushContent.values()), indent=4, ensure_ascii=False)

    with io.StringIO() as f:
        writer = csv.writer(f, delimiter='\t')
        writer.writerows(possegResult)
        csvString = f.getvalue()

    return (csvString, jsonString)


if __name__ == "__main__":
    pprint(findAuthorPush(['gwenwoo']))