import sys
import threading
import jieba
import re
import csv
import json
import _pickle as pickle
import os
import io
import multiprocessing as mp
from time import time, sleep
from functools import partial
from numpy import prod
from jieba import posseg
from progressbar import ProgressBar
from datetime import datetime
from PTTData import PTTData

defaultDate = {
    'startDate': None,
    'endDate': None
}

# Full post dump of the Gossiping board, loaded once at import time.
with open('/home/vis/pttDatabase/PTTData/Gossiping/content/content.pck', 'rb') as f:
    postContents = pickle.load(f)

defaultStopWords = []
data = PTTData('Gossiping', '/home/vis/pttDatabase/PTTData')
sentence_length = 100
use_push_count = False

with open('resource/stopWords.txt', 'r', encoding='UTF-8') as file:
    for word in file.readlines():
        defaultStopWords.append(word.strip())


def calcKey(startDate, endDate, keyword, pos):
    # Cache key: hex-encoded hashes of the query fields plus one 0/1 flag per POS switch.
    hashKey = hex(hash(startDate) + sys.maxsize)[2:] + \
        hex(hash(endDate) + sys.maxsize)[2:] + \
        hex(hash(keyword) + sys.maxsize)[2:]
    for key, val in pos.items():
        hashKey += '1' if val else '0'
    return hashKey


def contentProcess(content, text):
    # text is an (aid, content) pair; cut the post and group the kept tokens
    # into "sentences" for the word cloud.
    aid = text[0]
    text = text[1]
    if content['keyword'] != '' and content['keyword'] not in text:
        return None
    cutted = filterPOS(content, aid)
    sentenses = []
    if content['keyword'] != '':
        # Keep a window of 9 tokens on each side of every keyword occurrence,
        # then split the kept positions into consecutive runs.
        pos = [i for i, n in enumerate(cutted) if n == content['keyword']]
        resultPos = []
        for i in pos:
            for j in range(i - 9, i + 10):
                if 0 <= j < len(cutted) and j not in resultPos:
                    resultPos.append(j)
        lastPos = -1
        result = []
        for i in sorted(resultPos):
            if i - lastPos != 1 and lastPos != -1:
                sentenses.append(result)
                result = []
            result.append(cutted[i])
            lastPos = i
        sentenses.append(result)
    else:
        # No keyword: chop the whole post into chunks of sentence_length tokens.
        result = []
        for i in cutted:
            result.append(i)
            if len(result) >= sentence_length:
                sentenses.append(result.copy())
                result = []
        if result != []:
            sentenses.append(result)
    print(sentenses)
    return sentenses


def filterPOS(content, aid):
    # Keep only the tokens whose part-of-speech class is enabled in content['pos'].
    if prod(list(content['pos'].values())) == False:
        pos = content['pos']
        cuttedWithPOS = data.posseg(aid)
        cutted = []
        for i in cuttedWithPOS:
            if i['flag'][0] == 'n' or i['flag'][0] == 'N':
                if pos['noun']:
                    cutted.append(i['word'])
            elif i['flag'][0] == 'v' or (i['flag'][0] == 'V' and i['flag'] != 'Vi'):
                if pos['verb']:
                    cutted.append(i['word'])
            elif i['flag'] == 'Vi':
                if pos['adj']:
                    cutted.append(i['word'])
            elif i['flag'] == 'ADV':
                if pos['adv']:
                    cutted.append(i['word'])
            elif i['flag'] == 'r':
                if pos['pron']:
                    cutted.append(i['word'])
            elif i['flag'] == 'POST' or i['flag'] == 'T':
                if pos['aux']:
                    cutted.append(i['word'])
            else:
                if pos['other']:
                    if i['flag'] != 'eng' and i['flag'] != 'x' and i['flag'] != 'm':
                        cutted.append(i['word'])
                elif i['word'] == content['keyword']:
                    cutted.append(i['word'])
    else:
        # Every POS class is enabled: keep everything except English, symbols and numbers.
        cuttedWithPOS = data.posseg(aid)
        cutted = [i['word'] for i in cuttedWithPOS
                  if i['flag'] != 'eng' and i['flag'] != 'x' and i['flag'] != 'm']
    if 'stopwords' in content:
        stopwords = content['stopwords']
    else:
        stopwords = defaultStopWords
    stopped = [i for i in cutted]  # stop words are not removed on the server side
    return stopped


def findKeywordFrequency(content):
    # Count keyword occurrences and matching posts within the requested date range.
    startDate = datetime.strptime(
        content['startDate'], '%Y-%m-%d').strftime('%Y%m%d') + '000000'
    endDate = datetime.strptime(
        content['endDate'], '%Y-%m-%d').strftime('%Y%m%d') + '999999'
    result = {
        'wordCount': 0,
        'postCount': 0
    }
    for i in postContents:
        if i['date'] > endDate or i['date'] < startDate:
            continue
        if content['globKeyword'] not in i['content']:
            continue
        counts = len(re.findall(content['keyword'], i['content']))
        if counts > 0:
            result['wordCount'] += counts
            result['postCount'] += 1
    return result
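
# Sketch of the request payload that the helpers above and below consume.
# Only the key names are taken from this module; the values are illustrative
# placeholders, not defaults used anywhere by the server.
_EXAMPLE_CONTENT = {
    'startDate': '2020-01-01',   # inclusive, 'YYYY-MM-DD'
    'endDate': '2020-01-31',     # inclusive, 'YYYY-MM-DD'
    'keyword': '',               # '' means "no keyword filter"
    'globKeyword': '',           # coarse substring filter used by findKeywordFrequency
    'pos': {                     # POS classes kept by filterPOS
        'noun': True, 'verb': True, 'adj': True, 'adv': True,
        'pron': True, 'aux': True, 'other': True,
    },
    # 'stopwords': [...],        # optional; defaultStopWords is used when absent
}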
def findRange(wordList: list, text: str):
    # Locate a (start, end) span of text that still covers the tokens of one
    # cut sentence; findResult uses it to slice a preview of the raw post.
    bot = wordList[-1]
    mid = wordList[1:-1]
    pTop = text.find(wordList[0])
    pBot = text.rfind(wordList[-1])
    while True:
        # Push the start forward while every middle token still fits before the end.
        topTemp = text.find(wordList[0], pTop + 1)
        if topTemp == -1:
            break
        skip = False
        for w in mid:
            if text.find(w, topTemp, pBot) == -1:
                skip = True
                break
        if skip:
            break
        pTop = topTemp
    while True:
        # Pull the end backward while every middle token still fits after the start.
        botTemp = text.rfind(bot, pTop, pBot)
        if botTemp == -1:
            break
        skip = False
        for w in mid:
            if text.find(w, pTop, botTemp) == -1:
                skip = True
                break
        if skip:
            break
        pBot = botTemp
    return (pTop, pBot)


def findResult(content):
    timeStart = time()
    key = calcKey(content['startDate'], content['endDate'],
                  content['keyword'], content['pos'])  # cache key for this query (not used further here)
    startDate = datetime.strptime(
        content['startDate'], '%Y-%m-%d').strftime('%Y%m%d') + '000000'
    endDate = datetime.strptime(
        content['endDate'], '%Y-%m-%d').strftime('%Y%m%d') + '999999'
    result = []
    result.append(['id', 'text', 'count'])
    filtered = []
    titles = {
        'info': {
            'keyword': content['keyword'],
            'count': 0,
            'posts': 0
        }
    }
    # Step 1: keep posts in the date range that contain the keyword (if any).
    for i in postContents:
        if i['date'] > endDate or i['date'] < startDate:
            continue
        if content['keyword'] != '':
            if content['keyword'] in i['content']:
                filtered.append(i)
                titles['info']['count'] += len(
                    re.findall(content['keyword'], i['content']))
        else:
            filtered.append(i)
    titles['info']['posts'] = len(filtered)
    # Keep only the most pushed posts: 30 without a keyword, 100 with one.
    filtered = sorted(filtered, key=lambda x: x['pushes'], reverse=True)[
        :(30 if content['keyword'] == '' else 100)]
    print('Step 1 finished in', int(time() - timeStart), 'seconds')
    # Step 2: cut every remaining post, in parallel where fork is available.
    if os.name == 'posix':
        with mp.Pool(mp.cpu_count()) as pool:
            processes = pool.map_async(
                partial(contentProcess, content),
                [(i['aid'], i['content']) for i in filtered])
            sentensesList = processes.get()
    else:
        sentensesList = list(map(partial(contentProcess, content),
                                 [(i['aid'], i['content']) for i in filtered]))
    for index, i in enumerate(filtered):
        sentenses = sentensesList[index]
        if sum([len(s) for s in sentenses]) == 0:
            continue
        for j in sentenses:
            cut = findRange(j, i['content'])
            seq = ' '.join(j)
            if seq not in titles:
                titles[seq] = {
                    'title': i['title'],
                    'url': i['url'],
                    'pushes': i['pushes'],
                    'author': i['author'],
                    'date': datetime.strptime(
                        i['date'], '%Y%m%d%H%M%S').strftime('%a %b %d %H:%M:%S %Y'),
                    'part': i['content'][max(0, cut[0] - 20):
                                         min(len(i['content']), cut[1])].replace('\n', '')
                }
            result.append([len(result), seq, 1000 + i['pushes']])
    print('Step 2 finished in', int(time() - timeStart), 'seconds')
    fileString = io.StringIO()
    writer = csv.writer(fileString, delimiter='\t')
    writer.writerows(result)
    csvString = fileString.getvalue()
    jsonString = json.dumps(titles, ensure_ascii=False, indent=4)
    print('tsv generated in', int(time() - timeStart), 'seconds')
    return {
        'tsv': csvString,
        'json': jsonString,
        'stopWords': defaultStopWords
    }


def loadPostContents():
    # Reload the post dump from disk (used when the default date range changes).
    global postContents
    with open('/home/vis/pttDatabase/PTTData/Gossiping/content/content.pck', 'rb') as f:
        postContents = pickle.load(f)
def getDefault(startDate, endDate):
    # Build the default (keyword-less) word cloud for a date range, reloading
    # the post dump when the requested range changes.
    global defaultDate
    if startDate != defaultDate['startDate'] or endDate != defaultDate['endDate']:
        loadPostContents()
        defaultDate['startDate'] = startDate
        defaultDate['endDate'] = endDate
        print('Default data refreshed')
    timeStart = time()
    startDate = datetime.strptime(
        startDate, '%Y-%m-%d').strftime('%Y%m%d') + '000000'
    endDate = datetime.strptime(
        endDate, '%Y-%m-%d').strftime('%Y%m%d') + '999999'
    result = []
    result.append(['id', 'text', 'count'])
    titles = {
        'info': {
            'keyword': '',
            'counts': 0,
            'posts': 0
        }
    }
    filtered = []
    for i in postContents:
        if i['date'] > endDate or i['date'] < startDate:
            continue
        filtered.append(i)
    titles['info']['posts'] = len(filtered)
    # Keep the 30 most pushed posts in the range.
    filtered = sorted(filtered, key=lambda x: x['pushes'], reverse=True)[:30]
    content = {
        'keyword': '',
        'pos': {
            'other': True
        }
    }
    for i in filtered:
        sentenses = contentProcess(content, (i['aid'], i['content']))
        for j in sentenses:
            cut = findRange(j, i['content'])
            seq = ' '.join(j)
            if seq not in titles:
                titles[seq] = {
                    'title': i['title'],
                    'url': i['url'],
                    'pushes': i['pushes'],
                    'author': i['author'],
                    'date': datetime.strptime(
                        i['date'], '%Y%m%d%H%M%S').strftime('%a %b %d %H:%M:%S %Y'),
                    'part': i['content'][max(0, cut[0] - 20):
                                         min(len(i['content']), cut[1])].replace('\n', '')
                }
            result.append([len(result), seq,
                           (1000 + i['pushes']) if use_push_count else 3000])
    fileString = io.StringIO()
    writer = csv.writer(fileString, delimiter='\t')
    writer.writerows(result)
    csvString = fileString.getvalue()
    jsonString = json.dumps(titles, ensure_ascii=False, indent=4)
    print('tsv generated in', int(time() - timeStart), 'seconds')
    return {
        'tsv': csvString,
        'json': jsonString,
        'stopWords': defaultStopWords
    }
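
if __name__ == '__main__':
    # Minimal manual smoke test, a sketch only: it assumes the Gossiping
    # content pickle and the PTTData database referenced at the top of this
    # file are available, and it reuses the illustrative _EXAMPLE_CONTENT
    # payload defined above with a placeholder keyword. Only the lightweight
    # counting path is exercised here; findResult and getDefault additionally
    # need the posseg data behind PTTData.
    demo = dict(_EXAMPLE_CONTENT, keyword='台灣', globKeyword='台灣')
    print(findKeywordFrequency(demo))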