# ptt-sententree/dataHandlerPTT.py
# Data handler for the PTT sentence-tree visualization backend.
import sys
import threading
import jieba
import re
import csv
import json
import _pickle as pickle
import os
import io
import jieba
import multiprocessing as mp
from time import time, sleep
from functools import partial
from numpy import prod
from jieba import posseg
from progressbar import ProgressBar
from datetime import datetime
from PTTData import PTTData
# Date range served by the last getDefault() call; used to detect when the
# post cache must be reloaded.
defaultDate = {
    'startDate': None,
    'endDate': None
}

# Full post corpus, loaded once at import time from the pickled content dump.
# NOTE(review): pickle.load assumes this file is trusted local data.
postContents = None
with open('/home/vis/pttDatabase/PTTData/Gossiping/content/content.pck', 'rb') as f:
    # The 'with' block closes the file; the original's explicit f.close()
    # inside it was redundant and has been dropped.
    postContents = pickle.load(f)

defaultStopWords = []
data = PTTData('Gossiping', '/home/vis/pttDatabase/PTTData')

# Maximum tokens per sentence chunk when no keyword is given.
sentence_length = 100
# When False, overview rows get a flat weight instead of push-based weight.
use_push_count = False

with open('resource/stopWords.txt', 'r', encoding='UTF-8') as file:
    for word in file.readlines():
        word = word.strip()
        defaultStopWords.append(word)
def calcKey(startDate, endDate, keyword, pos):
    """Return a deterministic cache-key string for a query.

    The original implementation derived the key from the built-in
    ``hash()``, which is salted per process (PYTHONHASHSEED), so the
    "same" query produced a different key after every interpreter
    restart — defeating any cache keyed on the result. A hashlib digest
    is stable across runs and machines.

    Args:
        startDate: query start-date string (e.g. '2020-01-01').
        endDate: query end-date string.
        keyword: search keyword ('' when no keyword is set).
        pos: mapping of part-of-speech name -> bool; each flag is
            appended to the key as '1'/'0' in iteration order.

    Returns:
        Hex-digest string plus a POS bit-suffix identifying the query.
    """
    import hashlib  # local import keeps the file-level import block untouched

    # '\x1f' (unit separator) joins the fields unambiguously so that
    # ('ab', 'c') and ('a', 'bc') cannot collide.
    digest = hashlib.sha1(
        '\x1f'.join((str(startDate), str(endDate), str(keyword))).encode('utf-8')
    ).hexdigest()
    # Encode the POS toggles as bits so queries differing only in POS
    # filters still get distinct keys.
    bits = ''.join('1' if val else '0' for val in pos.values())
    return digest + bits
def contentProcess(content, text):
    """Cut one post into sentence chunks around the query keyword.

    Args:
        content: query dict; reads 'keyword' here and 'pos' via filterPOS.
        text: (aid, raw_content) tuple for one post.

    Returns:
        List of token lists (one per sentence chunk), or None when the
        post does not contain the keyword at all.
    """
    aid = text[0]
    raw = text[1]
    # Cheap pre-filter: skip the expensive POS cut when the keyword is
    # absent from the raw text.
    if content['keyword'] != '' and content['keyword'] not in raw:
        return None
    cutted = filterPOS(content, aid)
    sentenses = []
    if content['keyword'] != '':
        # Collect a +/-9 token window around every keyword occurrence,
        # then split the merged positions into runs of consecutive indices.
        hits = [i for i, tok in enumerate(cutted) if tok == content['keyword']]
        resultPos = []
        for i in hits:
            for j in range(i - 9, i + 10):
                if 0 <= j < len(cutted) and j not in resultPos:
                    resultPos.append(j)
        lastPos = -1
        result = []
        for i in sorted(resultPos):
            if i - lastPos != 1 and lastPos != -1:
                # Gap between windows: close the current chunk.
                sentenses.append(result)
                result = []
            result.append(cutted[i])
            lastPos = i
        # NOTE: when there are no hits this appends an empty chunk, as
        # the original did; callers filter those out via a length check.
        sentenses.append(result)
    else:
        # No keyword: chop the whole cut post into fixed-length chunks.
        result = []
        for tok in cutted:
            result.append(tok)
            if len(result) >= sentence_length:
                sentenses.append(result.copy())
                result = []
        if result != []:
            sentenses.append(result)
    # BUG FIX: removed the debug ``print(sentenses)`` — it dumped every
    # post's token chunks to stdout, interleaving garbage when run
    # inside the multiprocessing pool.
    return sentenses
def filterPOS(content, aid):
    """Cut post ``aid`` into words, keeping only enabled POS classes.

    Args:
        content: query dict with 'pos' (POS-name -> bool toggles) and
            'keyword'.
        aid: article id passed through to ``data.posseg``.

    Returns:
        List of word tokens, in post order, surviving the POS filter.
        Stop words are deliberately NOT removed on the server side.
    """
    if prod(list(content['pos'].values())) == False:
        # At least one POS class is switched off: inspect every token's tag.
        pos = content['pos']
        cuttedWithPOS = data.posseg(aid)
        cutted = []
        for tok in cuttedWithPOS:
            flag = tok['flag']
            word = tok['word']
            if flag[0] == 'n' or flag[0] == 'N':
                if pos['noun']:
                    cutted.append(word)
            elif flag[0] == 'v' or (flag[0] == 'V' and flag != 'Vi'):
                if pos['verb']:
                    cutted.append(word)
            elif flag == 'Vi':
                # 'Vi' (state verb) is mapped to the adjective toggle here.
                if pos['adj']:
                    cutted.append(word)
            elif flag == 'ADV':
                if pos['adv']:
                    cutted.append(word)
            elif flag == 'r':
                if pos['pron']:
                    cutted.append(word)
            elif flag == 'POST' or flag == 'T':
                if pos['aux']:
                    cutted.append(word)
            else:
                if pos['other']:
                    # Drop English fragments ('eng'), unknown tokens ('x')
                    # and numerals ('m').
                    if flag != 'eng' and flag != 'x' and flag != 'm':
                        cutted.append(word)
                elif word == content['keyword']:
                    # Keep the keyword itself even when its class is off.
                    cutted.append(word)
    else:
        # Every POS class is enabled: keep everything except English
        # fragments, unknown tokens and numerals.
        cuttedWithPOS = data.posseg(aid)
        cutted = [tok['word'] for tok in cuttedWithPOS
                  if tok['flag'] != 'eng' and tok['flag'] != 'x' and tok['flag'] != 'm']
    # Stop words are not removed on the server side (client decides what
    # to hide). The original read content['stopwords'] here but never
    # applied it, and also captured an unused start time — both dropped.
    return list(cutted)
def findKeywordFrequency(content):
    """Count keyword occurrences among posts inside a date range.

    Args:
        content: dict with 'startDate'/'endDate' ('%Y-%m-%d'),
            'globKeyword' (coarse pre-filter) and 'keyword' (the term
            actually counted).

    Returns:
        dict with 'wordCount' (total occurrences) and 'postCount'
        (number of posts containing the keyword at least once).
    """
    # Post dates are '%Y%m%d%H%M%S' strings, so range checks compare
    # lexicographically; '999999' is an out-of-range time suffix that
    # sorts after any real timestamp on the end day.
    startDate = datetime.strptime(
        content["startDate"], '%Y-%m-%d').strftime('%Y%m%d') + '000000'
    endDate = datetime.strptime(
        content['endDate'], '%Y-%m-%d').strftime('%Y%m%d') + '999999'
    result = {
        'wordCount': 0,
        'postCount': 0
    }
    for post in postContents:
        if post['date'] > endDate or post['date'] < startDate:
            continue
        if content['globKeyword'] not in post['content']:
            continue
        # BUG FIX: the original passed the raw keyword to re.findall,
        # which crashed or miscounted whenever the keyword contained
        # regex metacharacters ('(', '?', '+', ...). str.count gives the
        # intended literal, non-overlapping count.
        counts = post['content'].count(content['keyword'])
        if counts > 0:
            result['wordCount'] += counts
            result['postCount'] += 1
    return result
def findRange(wordList: list, text: str):
    """Locate the tightest (start, end) span of ``text`` that still
    contains all words of ``wordList`` in order.

    Starts from the first occurrence of the first word and the last
    occurrence of the last word, then greedily advances the start
    forward and the end backward while every inner word can still be
    found between them.

    Returns:
        (pTop, pBot) character indices into ``text``.
    """
    firstWord = wordList[0]
    lastWord = wordList[-1]
    innerWords = wordList[1:-1]
    pTop = text.find(firstWord)
    pBot = text.rfind(lastWord)

    # Phase 1: push the start as far right as possible.
    while True:
        candidate = text.find(firstWord, pTop + 1)
        if candidate == -1:
            break
        if not all(text.find(w, candidate, pBot) != -1 for w in innerWords):
            break
        pTop = candidate

    # Phase 2: pull the end as far left as possible.
    while True:
        candidate = text.rfind(lastWord, pTop, pBot)
        if candidate == -1:
            break
        if not all(text.find(w, pTop, candidate) != -1 for w in innerWords):
            break
        pBot = candidate

    return (pTop, pBot)
def findResult(content):
    """Run a keyword query and build the sentence-tree input payload.

    Args:
        content: query dict with 'startDate', 'endDate' ('%Y-%m-%d'),
            'keyword' ('' for a keyword-less query) and 'pos' toggles.

    Returns:
        dict with 'tsv' (id/text/count table as a TSV string), 'json'
        (per-sentence metadata) and 'stopWords' (default stop words).
    """
    timeStart = time()
    # NOTE(review): this cache key is computed but never used below —
    # presumably left over from a removed (or planned) result cache.
    key = calcKey(content['startDate'],
                  content["endDate"], content["keyword"], content["pos"])
    # '%Y%m%d%H%M%S' date strings compare lexicographically; '999999'
    # sorts after any real time suffix on the end day.
    startDate = datetime.strptime(
        content['startDate'], '%Y-%m-%d').strftime('%Y%m%d') + '000000'
    endDate = datetime.strptime(
        content['endDate'], '%Y-%m-%d').strftime('%Y%m%d') + '999999'
    result = [['id', 'text', 'count']]
    filtered = []
    titles = {
        'info': {
            'keyword': content['keyword'],
            'count': 0,
            'posts': 0
        }
    }
    for post in postContents:
        if post['date'] > endDate or post['date'] < startDate:
            continue
        if content['keyword'] != '':
            if content['keyword'] in post['content']:
                filtered.append(post)
                # Literal occurrence count; the original fed the raw
                # keyword to re.findall, which misbehaved on regex
                # metacharacters.
                titles['info']['count'] += post['content'].count(content['keyword'])
        else:
            filtered.append(post)
    titles['info']['posts'] = len(filtered)
    # Keep only the most-pushed posts: 30 for an overview, 100 for a
    # keyword query.
    limit = 30 if content['keyword'] == '' else 100
    filtered = sorted(filtered, key=lambda x: x['pushes'], reverse=True)[:limit]
    print('到第一步為止生成花費', int(time() - timeStart), '')
    pairs = [(post['aid'], post['content']) for post in filtered]
    if os.name == 'posix':
        # Cut posts in parallel; fork-based pools are POSIX-only here.
        with mp.Pool(mp.cpu_count()) as pool:
            sentensesList = pool.map_async(
                partial(contentProcess, content), pairs).get()
    else:
        # BUG FIX: the original assigned the lazy ``map`` object and
        # later indexed it (``sentensesList[index]``), raising TypeError
        # on every non-POSIX system. Materialize it as a list.
        sentensesList = list(map(partial(contentProcess, content), pairs))
    for index, post in enumerate(filtered):
        sentenses = sentensesList[index]
        # Skip posts that produced no (or only empty) sentence chunks;
        # also guards the None that contentProcess returns for misses.
        if sentenses is None or sum(len(s) for s in sentenses) == 0:
            continue
        for chunk in sentenses:
            cut = findRange(chunk, post['content'])
            seq = ' '.join(chunk)
            if seq not in titles:
                titles[seq] = {
                    'title': post['title'],
                    'url': post['url'],
                    'pushes': post['pushes'],
                    'author': post['author'],
                    'date': datetime.strptime(post['date'], '%Y%m%d%H%M%S').strftime("%a %b %d %H:%M:%S %Y"),
                    'part': post['content'][max(0, cut[0] - 20): min(len(post['content']), cut[1])].replace('\n', '')
                }
            result.append([len(result), seq, 1000 + post['pushes']])
    print('到第二步為止生成花費', int(time() - timeStart), '')
    fileString = io.StringIO()
    writer = csv.writer(fileString, delimiter='\t')
    writer.writerows(result)
    csvString = fileString.getvalue()
    jsonString = json.dumps(titles, ensure_ascii=False, indent=4)
    print('tsv', '生成花費', int(time() - timeStart), '')
    return {
        'tsv': csvString,
        'json': jsonString,
        'stopWords': defaultStopWords
    }
def loadPostContents():
    """(Re)load the pickled post corpus into the module-global cache.

    Side effect: rebinds the module-level ``postContents``.
    """
    global postContents
    # NOTE(review): pickle.load assumes the dump file is trusted local
    # data. The 'with' block closes the file; the original's explicit
    # f.close() inside it was redundant.
    with open('/home/vis/pttDatabase/PTTData/Gossiping/content/content.pck', 'rb') as f:
        postContents = pickle.load(f)
def getDefault(startDate, endDate):
    """Build the keyword-less default overview for a date range.

    Reloads the post corpus whenever the requested range differs from
    the previously served one, then produces the same payload shape as
    ``findResult`` for the 30 most-pushed posts in range.

    Args:
        startDate: range start as a '%Y-%m-%d' string.
        endDate: range end as a '%Y-%m-%d' string.

    Returns:
        dict with 'tsv' (id/text/count TSV string), 'json'
        (per-sentence metadata) and 'stopWords'.
    """
    global defaultDate
    if startDate != defaultDate['startDate'] or endDate != defaultDate['endDate']:
        # Range changed since the last call: refresh the post cache and
        # remember the new range.
        loadPostContents()
        defaultDate['startDate'] = startDate
        defaultDate['endDate'] = endDate
        print('更新預設資料')
    timeStart = time()
    # '%Y%m%d%H%M%S' date strings compare lexicographically; '999999'
    # sorts after any real time suffix on the end day.
    startDate = datetime.strptime(
        startDate, '%Y-%m-%d').strftime('%Y%m%d') + '000000'
    endDate = datetime.strptime(
        endDate, '%Y-%m-%d').strftime('%Y%m%d') + '999999'
    result = [['id', 'text', 'count']]
    titles = {
        'info': {
            'keyword': '',
            'counts': 0,
            'posts': 0
        }
    }
    filtered = [post for post in postContents
                if startDate <= post['date'] <= endDate]
    titles['info']['posts'] = len(filtered)
    # The overview keeps only the 30 most-pushed posts.
    filtered = sorted(filtered, key=lambda x: x['pushes'], reverse=True)[:30]
    # 'other': True makes prod(pos.values()) truthy, so filterPOS takes
    # its simple keep-everything cut path.
    content = {
        'keyword': '',
        'pos': {
            'other': True
        }
    }
    # The original also did an unused ``post = data[i]`` lookup and a
    # dead counter increment per post — both dropped.
    for post in filtered:
        sentenses = contentProcess(content, (post['aid'], post['content']))
        for chunk in sentenses:
            cut = findRange(chunk, post['content'])
            seq = ' '.join(chunk)
            if seq not in titles:
                titles[seq] = {
                    'title': post['title'],
                    'url': post['url'],
                    'pushes': post['pushes'],
                    'author': post['author'],
                    'date': datetime.strptime(post['date'], '%Y%m%d%H%M%S').strftime("%a %b %d %H:%M:%S %Y"),
                    'part': post['content'][max(0, cut[0] - 20): min(len(post['content']), cut[1])].replace('\n', '')
                }
            # Flat weight of 3000 unless push-based weighting is enabled.
            result.append([len(result), seq,
                           (1000 + post['pushes']) if use_push_count else 3000])
    fileString = io.StringIO()
    writer = csv.writer(fileString, delimiter='\t')
    writer.writerows(result)
    csvString = fileString.getvalue()
    jsonString = json.dumps(titles, ensure_ascii=False, indent=4)
    print('tsv', '生成花費', int(time() - timeStart), '')
    return {
        'tsv': csvString,
        'json': jsonString,
        'stopWords': defaultStopWords
    }