"""Build word-tree style TSV/JSON summaries of PTT Gossiping posts.

Posts are loaded from a pickled content cache, tokenized/POS-tagged via
PTTData, filtered by keyword + part-of-speech flags, and emitted as a
tab-separated table plus a JSON metadata map keyed by sentence fragment.
"""
import sys
import threading
import re
import csv
import json
import _pickle as pickle
import os
import io
import jieba
import multiprocessing as mp
from time import time, sleep
from functools import partial
from numpy import prod
from jieba import posseg
from progressbar import ProgressBar
from datetime import datetime
from PTTData import PTTData

# Single source of truth for the pickled post-content cache path
# (was duplicated at module load and in loadPostContents()).
CONTENT_PICKLE_PATH = '/home/vis/pttDatabase/PTTData/Gossiping/content/content.pck'

# Date range of the currently cached "default" result (see getDefault()).
defaultDate = {
    'startDate': None,
    'endDate': None
}

# NOTE(review): unpickling is only safe because this cache is produced
# locally by the PTTData pipeline — never point this at untrusted data.
with open(CONTENT_PICKLE_PATH, 'rb') as f:
    postContents = pickle.load(f)

defaultStopWords = []

data = PTTData('Gossiping', '/home/vis/pttDatabase/PTTData')

with open('resource/stopWords.txt', 'r', encoding='UTF-8') as fh:
    for word in fh.readlines():
        defaultStopWords.append(word.strip())


def calcKey(startDate, endDate, keyword, pos):
    """Derive a cache-key string from the query parameters.

    The key is the hex digests of the three string hashes plus one '1'/'0'
    flag per POS toggle.

    NOTE(review): str hashes are randomized per interpreter run unless
    PYTHONHASHSEED is fixed, so this key is NOT stable across restarts —
    confirm that is acceptable for however the key is consumed.
    """
    hashKey = str(hex(hash(startDate) + sys.maxsize)[2:]) + \
        str(hex(hash(endDate) + sys.maxsize)[2:]) + \
        str(hex(hash(keyword) + sys.maxsize)[2:])
    for key, val in pos.items():
        hashKey += ('1' if val else '0')
    return hashKey


def contentProcess(content, text):
    """Cut one post into sentence fragments for the word tree.

    Args:
        content: query dict with at least 'keyword' and 'pos' keys.
        text: (aid, raw_content) tuple for one post.

    Returns:
        A list of token lists, or None when a non-empty keyword does not
        occur in the raw content at all.
    """
    aid = text[0]
    text = text[1]
    if content['keyword'] != '':
        if content['keyword'] not in text:
            return None
    cutted = filterPOS(content, aid)
    sentenses = []
    if content['keyword'] != '':
        # Keep a +-9 token window around every keyword occurrence, then
        # split the merged windows into contiguous fragments.
        hits = [i for i, tok in enumerate(cutted) if tok == content['keyword']]
        resultPos = []
        for i in hits:
            for j in range(i - 9, i + 10):
                if 0 <= j < len(cutted) and j not in resultPos:
                    resultPos.append(j)
        lastPos = -1
        fragment = []
        for i in sorted(resultPos):
            if i - lastPos != 1 and lastPos != -1:
                # Gap between windows: close the current fragment.
                sentenses.append(fragment)
                fragment = []
            fragment.append(cutted[i])
            lastPos = i
        sentenses.append(fragment)
    else:
        # No keyword: chop the whole post into chunks of up to 50 tokens.
        fragment = []
        for tok in cutted:
            fragment.append(tok)
            if len(fragment) >= 50:
                sentenses.append(fragment.copy())
                fragment = []
        if fragment:
            sentenses.append(fragment)
    # BUG FIX: removed stray debug `print(sentenses)` — this function runs
    # inside multiprocessing workers and flooded stdout.
    return sentenses


def filterPOS(content, aid):
    """Tokenize post `aid` and keep only the POS classes enabled in content['pos'].

    Falls back to defaultStopWords when the query carries no 'stopwords'.
    Returns the filtered token list.
    """
    pos = content['pos']
    if not prod(list(pos.values())):
        # At least one POS class is disabled: walk the tagged tokens and
        # keep only the enabled classes.
        cuttedWithPOS = data.posseg(aid)
        cutted = []
        for tok in cuttedWithPOS:
            flag = tok.flag
            if flag[0] == 'n' or flag[0] == 'N':
                if pos['noun']:
                    cutted.append(tok.word)
            elif flag[0] == 'v' or (flag[0] == 'V' and flag != 'Vi'):
                if pos['verb']:
                    cutted.append(tok.word)
            elif flag == 'Vi':
                # 'Vi' (stative verb) is treated as an adjective here.
                if pos['adj']:
                    cutted.append(tok.word)
            elif flag == 'ADV':
                if pos['adv']:
                    cutted.append(tok.word)
            elif flag == 'r':
                if pos['pron']:
                    cutted.append(tok.word)
            elif flag == 'POST' or flag == 'T':
                if pos['aux']:
                    cutted.append(tok.word)
            else:
                if pos['other']:
                    # Drop English tokens, symbols and numbers.
                    if flag != 'eng' and flag != 'x' and flag != 'm':
                        cutted.append(tok.word)
                elif tok.word == content['keyword']:
                    # Even with 'other' disabled, never drop the keyword itself.
                    cutted.append(tok.word)
    else:
        # Every POS class enabled: keep everything except English tokens,
        # symbols and numbers.
        cuttedWithPOS = data.posseg(aid)
        cutted = [tok.word for tok in cuttedWithPOS
                  if tok.flag != 'eng' and tok.flag != 'x' and tok.flag != 'm']
    stopwords = content['stopwords'] if 'stopwords' in content else defaultStopWords
    return [tok for tok in cutted if tok not in stopwords]


def findKeywordFrequency(content):
    """Count keyword occurrences and matching posts inside the date range.

    Returns {'wordCount': total occurrences, 'postCount': posts with >= 1 hit}.
    """
    startDate = datetime.strptime(
        content["startDate"], '%Y-%m-%d').strftime('%Y%m%d') + '000000'
    endDate = datetime.strptime(
        content['endDate'], '%Y-%m-%d').strftime('%Y%m%d') + '999999'
    result = {
        'wordCount': 0,
        'postCount': 0
    }
    for post in postContents:
        if post['date'] > endDate or post['date'] < startDate:
            continue
        if content['globKeyword'] not in post['content']:
            continue
        # BUG FIX: was re.findall(keyword, ...) which treated the user's
        # keyword as a regex; str.count counts it literally.
        counts = post['content'].count(content['keyword'])
        if counts > 0:
            result['wordCount'] += counts
            result['postCount'] += 1
    return result


def findRange(wordList: list, text: str):
    """Locate the tightest (start, end) span of `text` containing the fragment.

    Greedily advances the first word's position forward, then pulls the last
    word's position backward, as long as every middle word still fits between
    them. Returns character offsets (pTop, pBot); either may be -1 when a
    word is absent from `text`.
    """
    first = wordList[0]
    last = wordList[-1]
    mid = wordList[1:-1]
    pTop = text.find(first)
    pBot = text.rfind(last)
    # Push the start as far right as possible.
    while True:
        topTemp = text.find(first, pTop + 1)
        if topTemp == -1:
            break
        if any(text.find(w, topTemp, pBot) == -1 for w in mid):
            break
        pTop = topTemp
    # Pull the end as far left as possible.
    while True:
        botTemp = text.rfind(last, pTop, pBot)
        if botTemp == -1:
            break
        if any(text.find(w, pTop, botTemp) == -1 for w in mid):
            break
        pBot = botTemp
    return (pTop, pBot)


def findResult(content):
    """Build the word-tree TSV/JSON payload for a keyword query.

    Filters posts by date range and keyword, keeps the 50 most-pushed,
    cuts them into fragments (in parallel on posix), and returns
    {'tsv': ..., 'json': ..., 'stopWords': ...}.
    """
    timeStart = time()
    startDate = datetime.strptime(
        content['startDate'], '%Y-%m-%d').strftime('%Y%m%d') + '000000'
    endDate = datetime.strptime(
        content['endDate'], '%Y-%m-%d').strftime('%Y%m%d') + '999999'
    result = [['id', 'text', 'count']]
    filtered = []
    titles = {
        'info': {
            'keyword': content['keyword'],
            'count': 0,
            'posts': 0
        }
    }
    for post in postContents:
        if post['date'] > endDate or post['date'] < startDate:
            continue
        if content['keyword'] != '':
            if content['keyword'] in post['content']:
                filtered.append(post)
                # BUG FIX: literal count instead of re.findall on a raw
                # user keyword (regex metacharacters broke the tally).
                titles['info']['count'] += post['content'].count(content['keyword'])
        else:
            filtered.append(post)
    titles['info']['posts'] = len(filtered)
    # Keep only the 50 most-pushed posts.
    filtered = sorted(filtered, key=lambda x: x['pushes'], reverse=True)[:50]
    print('到第一步為止生成花費', int(time() - timeStart), '秒')
    tasks = [(post['aid'], post['content']) for post in filtered]
    if os.name == 'posix':
        with mp.Pool(mp.cpu_count()) as pool:
            sentensesList = pool.map_async(
                partial(contentProcess, content), tasks).get()
    else:
        # BUG FIX: map() returns a lazy iterator, but sentensesList is
        # indexed below — materialize it.
        sentensesList = list(map(partial(contentProcess, content), tasks))
    for index, post in enumerate(filtered):
        sentenses = sentensesList[index]
        # Guard against a None/empty worker result.
        if not sentenses or sum(len(s) for s in sentenses) == 0:
            continue
        for fragment in sentenses:
            cut = findRange(fragment, post['content'])
            seq = ' '.join(fragment)
            if seq not in titles:
                titles[seq] = {
                    'title': post['title'],
                    'url': post['url'],
                    'pushes': post['pushes'],
                    'author': post['author'],
                    'date': datetime.strptime(
                        post['date'],
                        '%Y%m%d%H%M%S').strftime("%a %b %d %H:%M:%S %Y"),
                    'part': post['content'][
                        max(0, cut[0] - 20):
                        min(len(post['content']), cut[1])].replace('\n', '')
                }
                # NOTE(review): row appended only for first-seen fragments,
                # keeping the TSV in sync with the titles map — confirm
                # duplicate fragments were never meant to add extra rows.
                result.append([len(result), seq, 1000 + post['pushes']])
    print('到第二步為止生成花費', int(time() - timeStart), '秒')
    fileString = io.StringIO()
    writer = csv.writer(fileString, delimiter='\t')
    writer.writerows(result)
    csvString = fileString.getvalue()
    jsonString = json.dumps(titles, ensure_ascii=False, indent=4)
    print('tsv', '生成花費', int(time() - timeStart), '秒')
    return {
        'tsv': csvString,
        'json': jsonString,
        'stopWords': defaultStopWords
    }


def loadPostContents():
    """Reload the pickled post-content cache into the module-level list."""
    global postContents
    with open(CONTENT_PICKLE_PATH, 'rb') as f:
        postContents = pickle.load(f)


def getDefault(startDate, endDate):
    """Build the keyword-less default word tree for the given date range.

    Reloads the post cache whenever the requested range differs from the
    cached defaultDate. Returns the same payload shape as findResult().
    """
    global defaultDate
    if startDate != defaultDate['startDate'] or endDate != defaultDate['endDate']:
        # BUG FIX: the original referenced `loadPostContents` without
        # calling it, so the cache was never actually reloaded.
        loadPostContents()
        defaultDate['startDate'] = startDate
        defaultDate['endDate'] = endDate
        print('更新預設資料')
    timeStart = time()
    startDate = datetime.strptime(
        startDate, '%Y-%m-%d').strftime('%Y%m%d') + '000000'
    endDate = datetime.strptime(
        endDate, '%Y-%m-%d').strftime('%Y%m%d') + '999999'
    result = [['id', 'text', 'count']]
    titles = {}
    # NOTE(review): this uses key 'counts' while findResult uses 'count';
    # kept as-is because JSON consumers may rely on it — confirm.
    titles['info'] = {
        'keyword': '',
        'counts': 0,
        'posts': 0
    }
    filtered = []
    for post in postContents:
        if post['date'] > endDate or post['date'] < startDate:
            continue
        filtered.append(post)
    titles['info']['posts'] = len(filtered)
    filtered = sorted(filtered, key=lambda x: x['pushes'], reverse=True)[:50]
    # Empty keyword + only 'other' enabled => filterPOS keeps everything
    # except English/symbol/number tokens.
    content = {
        'keyword': '',
        'pos': {
            'other': True
        }
    }
    for post in filtered:
        sentenses = contentProcess(content, (post['aid'], post['content']))
        for fragment in sentenses:
            cut = findRange(fragment, post['content'])
            seq = ' '.join(fragment)
            if seq not in titles:
                titles[seq] = {
                    'title': post['title'],
                    'url': post['url'],
                    'pushes': post['pushes'],
                    'author': post['author'],
                    'date': datetime.strptime(
                        post['date'],
                        '%Y%m%d%H%M%S').strftime("%a %b %d %H:%M:%S %Y"),
                    'part': post['content'][
                        max(0, cut[0] - 20):
                        min(len(post['content']), cut[1])].replace('\n', '')
                }
                result.append([len(result), seq, 1000 + post['pushes']])
    fileString = io.StringIO()
    writer = csv.writer(fileString, delimiter='\t')
    writer.writerows(result)
    csvString = fileString.getvalue()
    jsonString = json.dumps(titles, ensure_ascii=False, indent=4)
    print('tsv', '生成花費', int(time() - timeStart), '秒')
    return {
        'tsv': csvString,
        'json': jsonString,
        'stopWords': defaultStopWords
    }