# dataHandlerPTT.py — builds sentence-tree data (tsv/json) from pickled PTT posts.
import csv
import hashlib
import io
import json
import multiprocessing as mp
import os
import _pickle as pickle
import re
import sys
import threading
from datetime import datetime
from functools import partial
from time import time, sleep

import jieba
from jieba import posseg
from numpy import prod
from progressbar import ProgressBar

from PTTData import PTTData
# --- Module-level state, initialized once at import time --------------------

# Date range the currently cached default result was built for.
defaultDate = {
    'startDate': None,
    'endDate': None
}

# All posts of the Gossiping board.
# NOTE(review): path is hard-coded to the deployment host — TODO make configurable.
postContents = None
with open('/home/vis/pttDatabase/PTTData/Gossiping/content/content.pck', 'rb') as f:
    postContents = pickle.load(f)
    # (redundant f.close() removed — the `with` block closes the file)

# Default stop-word list, one word per line.
defaultStopWords = []

# Cached POS-segmentation access for the board.
data = PTTData('Gossiping', '/home/vis/pttDatabase/PTTData')

with open('resource/stopWords.txt', 'r', encoding='UTF-8') as file:
    for word in file.readlines():
        defaultStopWords.append(word.strip())
def calcKey(startDate, endDate, keyword, pos):
    """Build a deterministic cache key for one query.

    The original implementation hashed the strings with the builtin
    ``hash()``, which is randomized per process (PYTHONHASHSEED), so the
    "same" query produced a different key on every run — useless for any
    persistent cache.  A SHA-1 digest of the parameters is stable.

    Args:
        startDate: query start date string (e.g. '2020-01-01').
        endDate: query end date string.
        keyword: search keyword ('' means no keyword filter).
        pos: dict mapping POS-class name -> bool; each flag is appended
            to the key as '1'/'0' in dict insertion order.

    Returns:
        40-char hex digest followed by one '1'/'0' per POS flag.
    """
    digest = hashlib.sha1(
        repr((startDate, endDate, keyword)).encode('utf-8')).hexdigest()
    flags = ''.join('1' if val else '0' for val in pos.values())
    return digest + flags
def contentProcess(content, text):
    """Cut one post into word sequences ("sentences") for the sentence tree.

    Args:
        content: query dict; uses 'keyword' (str, '' = no keyword) plus the
            keys consumed by filterPOS ('pos', optional 'stopwords').
        text: (aid, raw_content) tuple for one post.

    Returns:
        A list of word lists.  With a keyword: one word list per run of
        indices within 9 words of a keyword occurrence (overlapping windows
        merged).  Without a keyword: the whole cut post chunked into word
        lists of at most 50 words.  Returns None when the keyword does not
        occur in the raw content at all.
    """
    aid = text[0]
    raw = text[1]
    keyword = content['keyword']
    if keyword != '' and keyword not in raw:
        return None
    cutted = filterPOS(content, aid)
    sentenses = []
    if keyword != '':
        # Indices of every keyword occurrence in the cut word list.
        hits = [i for i, w in enumerate(cutted) if w == keyword]
        # Collect a +/-9 word window around each hit, deduplicated.
        window = []
        for i in hits:
            for j in range(i - 9, i + 10):
                if j >= 0 and j < len(cutted) and j not in window:
                    window.append(j)
        # Split the merged window indices into runs of consecutive positions;
        # each run becomes one "sentence".
        lastPos = -1
        run = []
        for i in sorted(window):
            if i - lastPos != 1 and lastPos != -1:
                sentenses.append(run)
                run = []
            run.append(cutted[i])
            lastPos = i
        sentenses.append(run)
    else:
        # No keyword: emit fixed-size chunks of up to 50 words.
        chunk = []
        for w in cutted:
            chunk.append(w)
            if len(chunk) >= 50:
                sentenses.append(chunk.copy())
                chunk = []
        if chunk != []:
            sentenses.append(chunk)
    # (leftover debug print(sentenses) removed — it spammed stdout per post)
    return sentenses
def filterPOS(content, aid):
    """Cut post `aid` into words, filtered by POS class and stop words.

    Args:
        content: query dict with 'pos' (dict: POS-class name -> bool),
            'keyword', and optionally 'stopwords' (overrides the module
            default list).
        aid: article id, passed to data.posseg() for the cached POS cut.

    Returns:
        List of words surviving both the POS filter and the stop-word filter.
    """
    pos = content['pos']
    # At least one POS class is switched off -> inspect every token's tag.
    # (Equivalent to the original `numpy.prod(values) == False` truth test.)
    if not all(pos.values()):
        cuttedWithPOS = data.posseg(aid)
        cutted = []
        for token in cuttedWithPOS:
            flag = token.flag
            if flag[0] == 'n' or flag[0] == 'N':
                if pos['noun']:
                    cutted.append(token.word)
            elif flag[0] == 'v' or (flag[0] == 'V' and flag != 'Vi'):
                if pos['verb']:
                    cutted.append(token.word)
            elif flag == 'Vi':
                # 'Vi' (stative/intransitive verb) is treated as the
                # adjective class here.
                if pos['adj']:
                    cutted.append(token.word)
            elif flag == 'ADV':
                if pos['adv']:
                    cutted.append(token.word)
            elif flag == 'r':
                if pos['pron']:
                    cutted.append(token.word)
            elif flag == 'POST' or flag == 'T':
                if pos['aux']:
                    cutted.append(token.word)
            else:
                if pos['other']:
                    # Drop English tokens ('eng'), non-words ('x'), numerals ('m').
                    if flag != 'eng' and flag != 'x' and flag != 'm':
                        cutted.append(token.word)
                else:
                    # Even with 'other' off, always keep the keyword itself.
                    if token.word == content['keyword']:
                        cutted.append(token.word)
    else:
        # All POS classes enabled: keep everything except English tokens,
        # non-words and numerals.
        cuttedWithPOS = data.posseg(aid)
        cutted = [t.word for t in cuttedWithPOS
                  if t.flag != 'eng' and t.flag != 'x' and t.flag != 'm']
    stopwords = content['stopwords'] if 'stopwords' in content else defaultStopWords
    return [w for w in cutted if w not in stopwords]
def findKeywordFrequency(content):
    """Count keyword occurrences and matching posts inside a date range."""
    lower = datetime.strptime(
        content['startDate'], '%Y-%m-%d').strftime('%Y%m%d') + '000000'
    upper = datetime.strptime(
        content['endDate'], '%Y-%m-%d').strftime('%Y%m%d') + '999999'
    result = {'wordCount': 0, 'postCount': 0}
    for post in postContents:
        if not (lower <= post['date'] <= upper):
            continue
        if content['globKeyword'] not in post['content']:
            continue
        # NOTE(review): the keyword is used as a regex pattern here, so regex
        # metacharacters in it change the matching — confirm this is intended.
        hits = len(re.findall(content['keyword'], post['content']))
        if hits > 0:
            result['wordCount'] += hits
            result['postCount'] += 1
    return result
def findRange(wordList: list, text: str):
    """Locate the tightest span of `text` containing `wordList` in order.

    Starts from the first occurrence of the first word and the last
    occurrence of the last word, then greedily advances the left edge and
    pulls in the right edge while every inner word still fits between them.

    Returns:
        (start, end) character indices into `text` (end points at the last
        word's final occurrence inside the span; -1 sentinels possible when
        a word is absent).
    """
    first = wordList[0]
    last = wordList[-1]
    inner = wordList[1:-1]
    lo = text.find(first)
    hi = text.rfind(last)
    # Advance the left edge to the latest occurrence of the first word that
    # still leaves room for every inner word before the right edge.
    while True:
        cand = text.find(first, lo + 1)
        if cand == -1:
            break
        if any(text.find(w, cand, hi) == -1 for w in inner):
            break
        lo = cand
    # Pull the right edge in to the earliest workable occurrence of the
    # last word after the left edge.
    while True:
        cand = text.rfind(last, lo, hi)
        if cand == -1:
            break
        if any(text.find(w, lo, cand) == -1 for w in inner):
            break
        hi = cand
    return (lo, hi)
def findResult(content):
    """Build the sentence-tree data (tsv + json) for a keyword query.

    Args:
        content: query dict with 'startDate'/'endDate' ('%Y-%m-%d'),
            'keyword' ('' = no keyword), 'pos', and optionally 'stopwords'.

    Returns:
        dict with 'tsv' (tab-separated id/text/count table), 'json'
        (per-sentence post metadata as a JSON string) and 'stopWords'
        (the default stop-word list).
    """
    timeStart = time()
    startDate = datetime.strptime(
        content['startDate'], '%Y-%m-%d').strftime('%Y%m%d') + '000000'
    endDate = datetime.strptime(
        content['endDate'], '%Y-%m-%d').strftime('%Y%m%d') + '999999'
    result = [['id', 'text', 'count']]
    filtered = []
    titles = {
        'info': {
            'keyword': content['keyword'],
            'count': 0,
            'posts': 0
        }
    }
    # Collect posts inside the date range (and containing the keyword, if any).
    for post in postContents:
        if post['date'] > endDate or post['date'] < startDate:
            continue
        if content['keyword'] != '':
            if content['keyword'] in post['content']:
                filtered.append(post)
                titles['info']['count'] += len(
                    re.findall(content['keyword'], post['content']))
        else:
            filtered.append(post)
    titles['info']['posts'] = len(filtered)
    # Keep only the 50 most-pushed posts.
    filtered = sorted(filtered, key=lambda x: x['pushes'], reverse=True)[:50]
    print('到第一步為止生成花費', int(time() - timeStart), '')
    postPairs = [(p['aid'], p['content']) for p in filtered]
    if os.name == 'posix':
        with mp.Pool(mp.cpu_count()) as pool:
            sentensesList = pool.map_async(
                partial(contentProcess, content), postPairs).get()
    else:
        # BUG FIX: map() returns a lazy, non-subscriptable iterator; the
        # original crashed with TypeError at sentensesList[index] below on
        # non-posix systems.  Materialize the results as a list.
        sentensesList = list(map(partial(contentProcess, content), postPairs))
    for index, post in enumerate(filtered):
        sentenses = sentensesList[index]
        if sum(len(s) for s in sentenses) == 0:
            continue
        for words in sentenses:
            cut = findRange(words, post['content'])
            seq = ' '.join(words)
            if seq not in titles:
                titles[seq] = {
                    'title': post['title'],
                    'url': post['url'],
                    'pushes': post['pushes'],
                    'author': post['author'],
                    'date': datetime.strptime(
                        post['date'], '%Y%m%d%H%M%S'
                    ).strftime("%a %b %d %H:%M:%S %Y"),
                    # Snippet: 20 chars of left context up to the span end.
                    'part': post['content'][
                        max(0, cut[0] - 20): min(len(post['content']), cut[1])
                    ].replace('\n', '')
                }
            result.append([len(result), seq, 1000 + post['pushes']])
    print('到第二步為止生成花費', int(time() - timeStart), '')
    fileString = io.StringIO()
    writer = csv.writer(fileString, delimiter='\t')
    writer.writerows(result)
    csvString = fileString.getvalue()
    jsonString = json.dumps(titles, ensure_ascii=False, indent=4)
    print('tsv', '生成花費', int(time() - timeStart), '')
    return {
        'tsv': csvString,
        'json': jsonString,
        'stopWords': defaultStopWords
    }
def loadPostContents():
    """(Re)load the pickled post list into the module-global postContents.

    NOTE(review): the path is hard-coded and mirrors the import-time load —
    TODO make configurable.
    """
    global postContents
    with open('/home/vis/pttDatabase/PTTData/Gossiping/content/content.pck', 'rb') as f:
        postContents = pickle.load(f)
        # (redundant f.close() removed — the `with` block closes the file)
def getDefault(startDate, endDate):
    """Build (and lazily refresh) the default sentence-tree data for a range.

    Same output shape as findResult(), but with no keyword and only the
    'other' POS class enabled.  Reloads the post pickle when the requested
    range differs from the previously served one.

    Args:
        startDate: '%Y-%m-%d' string, range start (inclusive).
        endDate: '%Y-%m-%d' string, range end (inclusive).

    Returns:
        dict with 'tsv', 'json' and 'stopWords' keys (see findResult).
    """
    global defaultDate
    if startDate != defaultDate['startDate'] or endDate != defaultDate['endDate']:
        loadPostContents()
        defaultDate['startDate'] = startDate
        defaultDate['endDate'] = endDate
        print('更新預設資料')
    timeStart = time()
    lower = datetime.strptime(startDate, '%Y-%m-%d').strftime('%Y%m%d') + '000000'
    upper = datetime.strptime(endDate, '%Y-%m-%d').strftime('%Y%m%d') + '999999'
    result = [['id', 'text', 'count']]
    titles = {
        'info': {
            'keyword': '',
            'counts': 0,
            'posts': 0
        }
    }
    filtered = [p for p in postContents if lower <= p['date'] <= upper]
    titles['info']['posts'] = len(filtered)
    # Keep only the 50 most-pushed posts.
    filtered = sorted(filtered, key=lambda x: x['pushes'], reverse=True)[:50]
    content = {
        'keyword': '',
        'pos': {
            'other': True
        }
    }
    for post in filtered:
        # NOTE(review): the original also did an unused `post = data[i]`
        # lookup here; dropped since its result was never read.
        sentenses = contentProcess(content, (post['aid'], post['content']))
        for words in sentenses:
            cut = findRange(words, post['content'])
            seq = ' '.join(words)
            if seq not in titles:
                titles[seq] = {
                    'title': post['title'],
                    'url': post['url'],
                    'pushes': post['pushes'],
                    'author': post['author'],
                    'date': datetime.strptime(
                        post['date'], '%Y%m%d%H%M%S'
                    ).strftime("%a %b %d %H:%M:%S %Y"),
                    # Snippet: 20 chars of left context up to the span end.
                    'part': post['content'][
                        max(0, cut[0] - 20): min(len(post['content']), cut[1])
                    ].replace('\n', '')
                }
            result.append([len(result), seq, 1000 + post['pushes']])
    fileString = io.StringIO()
    writer = csv.writer(fileString, delimiter='\t')
    writer.writerows(result)
    csvString = fileString.getvalue()
    jsonString = json.dumps(titles, ensure_ascii=False, indent=4)
    print('tsv', '生成花費', int(time() - timeStart), '')
    return {
        'tsv': csvString,
        'json': jsonString,
        'stopWords': defaultStopWords
    }