|
|
|
import sys
|
|
|
|
import threading
|
|
|
|
import jieba
|
|
|
|
import re
|
|
|
|
import csv
|
|
|
|
import json
|
|
|
|
import _pickle as pickle
|
|
|
|
import os
|
|
|
|
import io
|
|
|
|
import jieba
|
|
|
|
import multiprocessing as mp
|
|
|
|
from time import time, sleep
|
|
|
|
from functools import partial
|
|
|
|
from numpy import prod
|
|
|
|
from jieba import posseg
|
|
|
|
from progressbar import ProgressBar
|
|
|
|
from datetime import datetime
|
|
|
|
from PTTData import PTTData
|
|
|
|
|
|
|
|
|
|
|
|
# Date range most recently served by ``getDefault``; that function compares
# each request against these values and overwrites them in place.
defaultDate = {
    'startDate': None,
    'endDate': None,
}

# All post records are loaded eagerly when the module is imported.
# NOTE(review): unpickling can execute arbitrary code, so this file must only
# ever be written by a trusted process.
postContents = None
with open('/home/vis/pttDatabase/PTTData/Gossiping/content/content.pck', 'rb') as f:
    # The ``with`` block closes the file; no explicit close() is needed.
    postContents = pickle.load(f)

defaultStopWords = []
data = PTTData('Gossiping', '/home/vis/pttDatabase/PTTData')

# One stop word per line; surrounding whitespace (including the newline) is
# stripped. Blank lines are kept as empty strings.
with open('resource/stopWords.txt', 'r', encoding='UTF-8') as file:
    defaultStopWords.extend(word.strip() for word in file)
|
|
|
|
|
|
|
|
|
|
|
|
def calcKey(startDate, endDate, keyword, pos):
    """Build a string key identifying one query.

    ``startDate``, ``endDate`` and ``keyword`` each contribute the hex digits
    of ``hash(value) + sys.maxsize``. One ``'1'`` or ``'0'`` character is then
    appended per entry of ``pos``, in the dict's iteration order, depending on
    the truthiness of the entry's value.

    NOTE(review): ``hash()`` of a ``str`` is salted per interpreter process
    unless ``PYTHONHASHSEED`` is pinned, so these keys are presumably only
    meant to be compared within one process -- confirm before persisting them.

    Args:
        startDate: Hashable start-date value.
        endDate: Hashable end-date value.
        keyword: Hashable keyword value.
        pos: Mapping whose values are the part-of-speech switches.

    Returns:
        The concatenated key string.
    """
    def _hexOfHash(value):
        # hex() prefixes its result with "0x"; the slice drops that prefix.
        return hex(hash(value) + sys.maxsize)[2:]

    hashedParts = [_hexOfHash(part) for part in (startDate, endDate, keyword)]
    switchBits = ['1' if enabled else '0' for enabled in pos.values()]
    return ''.join(hashedParts + switchBits)
|
|
|
|
|
|
|
|
|
|
|
|
def contentProcess(content, text):
    """Split one post into token lists ("sentences").

    Args:
        content: Query dict. ``content['keyword']`` is read here and the whole
            dict is forwarded to ``filterPOS``.
        text: Sequence whose first item is the post id handed to ``filterPOS``
            and whose second item is the raw post text used for the keyword
            pre-check.

    Returns:
        ``None`` when a keyword is set but does not occur in the raw text.
        Otherwise a list of token lists:

        * with a keyword: the tokens within 9 positions on either side of
          every token equal to the keyword, with overlapping or adjacent
          windows merged into a single list (one empty list if no token
          equals the keyword);
        * without a keyword: consecutive chunks of 50 tokens, the last chunk
          possibly shorter.

    The result is also printed to stdout before being returned.
    """
    aid = text[0]
    body = text[1]
    keyword = content['keyword']

    # Cheap substring test first, so posts that cannot contain the keyword
    # are rejected before any token filtering is done.
    if keyword != '' and keyword not in body:
        return None

    cutted = filterPOS(content, aid)

    if keyword != '':
        # Collect every index that lies inside a +/-9 window around a hit.
        # A set gives constant-time de-duplication instead of scanning a list
        # for every candidate index.
        tokenCount = len(cutted)
        windowPos = set()
        for hit, token in enumerate(cutted):
            if token == keyword:
                windowPos.update(
                    range(max(0, hit - 9), min(tokenCount, hit + 10)))

        # Walk the indices in order and start a new sentence at every gap.
        sentenses = []
        result = []
        lastPos = -1
        for i in sorted(windowPos):
            if lastPos != -1 and i - lastPos != 1:
                sentenses.append(result)
                result = []
            result.append(cutted[i])
            lastPos = i
        sentenses.append(result)
    else:
        # No keyword: fixed-size chunks of 50 tokens.
        sentenses = [cutted[start:start + 50]
                     for start in range(0, len(cutted), 50)]

    print(sentenses)
    return sentenses
|
|
|
|
|
|
|
|
|
|
|
|
def filterPOS(content, aid):
    """Return the words of post ``aid`` that pass the POS and stop-word filters.

    Args:
        content: Query dict. ``content['pos']`` maps category names
            (``'noun'``, ``'verb'``, ``'adj'``, ``'adv'``, ``'pron'``,
            ``'aux'``, ``'other'``) to on/off switches. ``content['keyword']``
            is consulted for tokens of the "other" category when that category
            is switched off. The optional ``content['stopwords']`` replaces
            the module-level ``defaultStopWords``.
        aid: Post id passed to ``data.posseg``; each item it yields is read
            through its ``'word'`` and ``'flag'`` entries.

    Returns:
        List of the surviving words, in their original order.
    """
    # Flags that are always discarded from the "other" category.
    # NOTE(review): presumably English tokens, punctuation and numerals in the
    # tagger's tag set -- confirm against the tagger documentation.
    ignoredFlags = ('eng', 'x', 'm')

    pos = content['pos']
    if all(pos.values()):
        # Every category is enabled: only the ignored flags are dropped.
        cutted = [token['word'] for token in data.posseg(aid)
                  if token['flag'] not in ignoredFlags]
    else:
        cutted = []
        for token in data.posseg(aid):
            flag = token['flag']
            head = flag[0]
            if head == 'n' or head == 'N':
                keep = pos['noun']
            elif head == 'v' or (head == 'V' and flag != 'Vi'):
                keep = pos['verb']
            elif flag == 'Vi':
                keep = pos['adj']
            elif flag == 'ADV':
                keep = pos['adv']
            elif flag == 'r':
                keep = pos['pron']
            elif flag == 'POST' or flag == 'T':
                keep = pos['aux']
            elif pos['other']:
                keep = flag not in ignoredFlags
            else:
                # NOTE(review): the source indentation was lost; this branch
                # is read as the "other category disabled" case, where only
                # the keyword itself is kept -- confirm the intended nesting.
                keep = token['word'] == content['keyword']
            if keep:
                cutted.append(token['word'])

    if 'stopwords' in content:
        stopwords = content['stopwords']
    else:
        stopwords = defaultStopWords

    # A set makes each membership test constant-time. Only plain sequences are
    # converted, so any other container keeps its own ``in`` semantics.
    if isinstance(stopwords, (list, tuple)):
        stopwords = frozenset(stopwords)
    return [word for word in cutted if word not in stopwords]
|
|
|
|
|
|
|
|
|
|
|
|
def findKeywordFrequency(content):
    """Count keyword matches in the in-range posts that mention ``globKeyword``.

    Args:
        content: Query dict with ``'startDate'`` and ``'endDate'`` in
            ``YYYY-MM-DD`` form, ``'globKeyword'`` (a plain substring a post
            must contain) and ``'keyword'`` (passed to ``re.findall`` as the
            pattern).

    Returns:
        Dict with ``'wordCount'`` (total number of matches) and
        ``'postCount'`` (number of posts with at least one match).

    NOTE(review): ``'keyword'`` is interpreted as a regular expression here,
    so a keyword containing metacharacters is not matched literally -- confirm
    that this is intended.
    """
    def _stamp(day, suffix):
        # Post dates are compared as "YYYYMMDDhhmmss" strings.
        return datetime.strptime(day, '%Y-%m-%d').strftime('%Y%m%d') + suffix

    lowerBound = _stamp(content["startDate"], '000000')
    upperBound = _stamp(content['endDate'], '999999')

    wordCount = 0
    postCount = 0
    for post in postContents:
        if not (lowerBound <= post['date'] <= upperBound):
            continue
        body = post['content']
        if content['globKeyword'] not in body:
            continue
        hits = len(re.findall(content['keyword'], body))
        if hits > 0:
            wordCount += hits
            postCount += 1

    return {
        'wordCount': wordCount,
        'postCount': postCount
    }
|
|
|
|
|
|
|
|
|
|
|
|
def findRange(wordList: list, text: str):
    """Locate the tightest span of ``text`` running from the first to the last word.

    The search starts from the first occurrence of ``wordList[0]`` and the
    last occurrence of ``wordList[-1]``. It then narrows the span from both
    sides for as long as every word in between can still be found inside it.

    Args:
        wordList: Words of one sentence, in order.
        text: Text to search.

    Returns:
        Tuple ``(pTop, pBot)``: the start index of the chosen occurrence of
        the first word and the start index of the chosen occurrence of the
        last word. When either word does not occur in ``text`` the
        un-narrowed positions are returned (the missing one being ``-1``).
        ``(-1, -1)`` is returned for an empty ``wordList``.
    """
    if not wordList:
        return (-1, -1)

    top = wordList[0]
    bot = wordList[-1]
    mid = wordList[1:-1]

    pTop = text.find(top)
    pBot = text.rfind(bot)
    if pTop == -1 or pBot == -1:
        # One end is missing, so there is no span to narrow.
        return (pTop, pBot)

    def _holdsMiddle(start, end):
        # True when every middle word occurs within text[start:end].
        return all(text.find(word, start, end) != -1 for word in mid)

    # Move the start forward to later occurrences of the first word.
    while True:
        topTemp = text.find(top, pTop + 1)
        # Never step past the end position: without this check a later
        # occurrence of the first word yields an inverted range whenever
        # there are no middle words to stop the loop.
        if topTemp == -1 or topTemp > pBot:
            break
        if not _holdsMiddle(topTemp, pBot):
            break
        pTop = topTemp

    # Move the end backward to earlier occurrences of the last word.
    while True:
        botTemp = text.rfind(bot, pTop, pBot)
        # ``botTemp >= pBot`` is only possible for an empty last word; stop
        # instead of looping without progress.
        if botTemp == -1 or botTemp >= pBot:
            break
        if not _holdsMiddle(pTop, botTemp):
            break
        pBot = botTemp

    return (pTop, pBot)
|
|
|
|
|
|
|
|
|
|
|
|
def findResult(content):
    """Build the TSV/JSON payload for one keyword query.

    Steps:
      1. Keep the posts dated within the requested range that contain the
         keyword as a substring (every post in range when the keyword is
         empty), counting the keyword occurrences along the way.
      2. Keep the 50 posts with the most pushes.
      3. Cut each of them into sentences with ``contentProcess`` -- in a
         process pool when ``os.name == 'posix'``, sequentially otherwise.
      4. Emit one TSV row and one JSON entry per distinct sentence.

    Args:
        content: Query dict with ``'startDate'`` and ``'endDate'``
            (``YYYY-MM-DD``) and ``'keyword'``; the whole dict is forwarded
            to ``contentProcess``.

    Returns:
        Dict with ``'tsv'`` (tab-separated rows ``id, text, count``),
        ``'json'`` (sentence -> post metadata, plus an ``'info'`` entry) and
        ``'stopWords'`` (the module-level default stop-word list).
    """
    timeStart = time()
    keyword = content['keyword']
    startDate = datetime.strptime(
        content['startDate'], '%Y-%m-%d').strftime('%Y%m%d') + '000000'
    endDate = datetime.strptime(
        content['endDate'], '%Y-%m-%d').strftime('%Y%m%d') + '999999'

    titles = {
        'info': {
            'keyword': keyword,
            'count': 0,
            'posts': 0
        }
    }

    filtered = []
    for post in postContents:
        if post['date'] > endDate or post['date'] < startDate:
            continue
        if keyword == '':
            filtered.append(post)
        elif keyword in post['content']:
            filtered.append(post)
            # The keyword is matched literally everywhere else in this
            # function, so count it literally too; treating it as a regex
            # breaks on keywords that contain metacharacters.
            titles['info']['count'] += post['content'].count(keyword)

    titles['info']['posts'] = len(filtered)
    filtered = sorted(filtered, key=lambda x: x['pushes'], reverse=True)[:50]
    print('到第一步為止生成花費', int(time()-timeStart), '秒')

    worker = partial(contentProcess, content)
    jobs = [(post['aid'], post['content']) for post in filtered]
    if os.name == 'posix':
        with mp.Pool(mp.cpu_count()) as pool:
            sentensesList = pool.map(worker, jobs)
    else:
        # map() is lazy in Python 3; materialise it so the results can be
        # paired with the posts below.
        sentensesList = list(map(worker, jobs))

    result = [['id', 'text', 'count']]
    for post, sentenses in zip(filtered, sentensesList):
        # contentProcess yields None or only empty lists when it found
        # nothing usable in the post.
        if not sentenses or not any(sentenses):
            continue
        body = post['content']
        for words in sentenses:
            if not words:
                continue
            cut = findRange(words, body)
            seq = ' '.join(words)
            # NOTE(review): the source indentation was lost; the TSV row is
            # assumed to be emitted only for sentences not seen before --
            # confirm the intended nesting.
            if seq not in titles:
                titles[seq] = {
                    'title': post['title'],
                    'url': post['url'],
                    'pushes': post['pushes'],
                    'author': post['author'],
                    'date': datetime.strptime(post['date'], '%Y%m%d%H%M%S').strftime("%a %b %d %H:%M:%S %Y"),
                    # Excerpt starting up to 20 characters before the range
                    # start; the end index is whatever findRange reports.
                    'part': body[max(0, cut[0] - 20): min(len(body), cut[1])].replace('\n', '')
                }
                # The header row occupies index 0, so ids start at 1.
                result.append([len(result), seq, 1000 + post['pushes']])
    print('到第二步為止生成花費', int(time()-timeStart), '秒')

    fileString = io.StringIO()
    csv.writer(fileString, delimiter='\t').writerows(result)
    csvString = fileString.getvalue()
    jsonString = json.dumps(titles, ensure_ascii=False, indent=4)
    print('tsv', '生成花費', int(time() - timeStart), '秒')

    return {
        'tsv': csvString,
        'json': jsonString,
        'stopWords': defaultStopWords
    }
|
|
|
|
|
|
|
|
|
|
|
|
def loadPostContents(path='/home/vis/pttDatabase/PTTData/Gossiping/content/content.pck'):
    """Reload the module-level ``postContents`` from a pickle file.

    Args:
        path: Pickle file to read. Defaults to the location this function has
            always read from, so existing zero-argument calls are unaffected.

    NOTE(review): unpickling can execute arbitrary code; the file must come
    from a trusted producer.
    """
    global postContents
    with open(path, 'rb') as f:
        # The ``with`` block closes the file; no explicit close() is needed.
        postContents = pickle.load(f)
|
|
|
|
|
|
|
|
|
|
|
|
def getDefault(startDate, endDate):
    """Build the keyword-less default TSV/JSON payload for a date range.

    When the requested range differs from the one recorded in
    ``defaultDate``, the post data is reloaded through ``loadPostContents``
    and ``defaultDate`` is updated. The 50 most-pushed posts in range are then
    cut into sentences by ``contentProcess`` (empty keyword, only the
    ``'other'`` switch set), and each distinct sentence becomes one TSV row
    and one JSON entry.

    Args:
        startDate: Range start, ``YYYY-MM-DD``.
        endDate: Range end, ``YYYY-MM-DD``.

    Returns:
        Dict with ``'tsv'``, ``'json'`` and ``'stopWords'`` (the module-level
        default stop-word list).
    """
    global defaultDate
    sameRange = (startDate == defaultDate['startDate']
                 and endDate == defaultDate['endDate'])
    if not sameRange:
        loadPostContents()
        defaultDate['startDate'] = startDate
        defaultDate['endDate'] = endDate
        print('更新預設資料')

    timeStart = time()
    # Post dates are compared as "YYYYMMDDhhmmss" strings.
    lowerBound = datetime.strptime(startDate, '%Y-%m-%d').strftime('%Y%m%d') + '000000'
    upperBound = datetime.strptime(endDate, '%Y-%m-%d').strftime('%Y%m%d') + '999999'

    inRange = [post for post in postContents
               if not (post['date'] > upperBound or post['date'] < lowerBound)]

    titles = {
        'info': {
            'keyword': '',
            'counts': 0,
            'posts': len(inRange)
        }
    }

    inRange.sort(key=lambda post: post['pushes'], reverse=True)
    topPosts = inRange[:50]

    query = {
        'keyword': '',
        'pos': {
            'other': True
        }
    }

    rows = [['id', 'text', 'count']]
    for entry in topPosts:
        # NOTE(review): the looked-up value is never used; the lookup is kept
        # because its effects are not visible from this file.
        post = data[entry]
        body = entry['content']
        for words in contentProcess(query, (entry['aid'], body)):
            cut = findRange(words, body)
            seq = ' '.join(words)
            # NOTE(review): the source indentation was lost; the TSV row is
            # assumed to be emitted only for sentences not seen before --
            # confirm the intended nesting.
            if seq in titles:
                continue
            titles[seq] = {
                'title': entry['title'],
                'url': entry['url'],
                'pushes': entry['pushes'],
                'author': entry['author'],
                'date': datetime.strptime(entry['date'], '%Y%m%d%H%M%S').strftime("%a %b %d %H:%M:%S %Y"),
                'part': body[max(0, cut[0] - 20): min(len(body), cut[1])].replace('\n', '')
            }
            # The header row occupies index 0, so ids start at 1.
            rows.append([len(rows), seq, 1000 + entry['pushes']])

    buffer = io.StringIO()
    csv.writer(buffer, delimiter='\t').writerows(rows)
    csvString = buffer.getvalue()
    jsonString = json.dumps(titles, ensure_ascii=False, indent=4)
    print('tsv', '生成花費', int(time() - timeStart), '秒')

    return {
        'tsv': csvString,
        'json': jsonString,
        'stopWords': defaultStopWords
    }
|