You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
40 lines
1.2 KiB
40 lines
1.2 KiB
import _pickle as pickle
import json
from collections import Counter
from pprint import pprint

import multiprocess.pool as mp
from progressbar import ProgressBar

from PTTData import PTTData
|
|
|
|
# Corpus accessor used by the script below. The meaning of the two arguments
# is defined by PTTData (not visible here) -- presumably board name and
# database root directory; confirm against PTTData.
data = PTTData('Gossiping', '/home/vis/pttDatabase/PTTData')

# Load the pickled post contents at import time.
# NOTE(review): unpickling can execute arbitrary code; this is only safe
# because the file is a locally produced, trusted artifact.
# NOTE(review): postContents is not referenced by the visible script body;
# it is kept so the module-level name stays available.
with open('/home/vis/pttDatabase/PTTData/Gossiping/content/content.pck', 'rb') as f:
    # The context manager closes the file; no explicit f.close() is needed.
    postContents = pickle.load(f)
|
|
|
|
def countWordFrequencies(posts, excludedFlags=('eng', 'x', 'm')):
    """Count word occurrences over a collection of segmented posts.

    Args:
        posts: Iterable of mappings. Each one has a 'content' entry holding
            an iterable of tokens, and every token is a mapping with 'word'
            and 'flag' keys.
        excludedFlags: Flag values whose tokens are ignored. The default is
            the set of flags this script has always skipped (presumably
            part-of-speech tags for English text, symbols and numerals --
            confirm against the segmenter used by PTTData.posseg).

    Returns:
        A ``(tfTable, idfTable)`` tuple of Counters keyed by word.
        ``tfTable`` holds the total number of kept occurrences across all
        posts; ``idfTable`` holds the number of posts in which the word
        appears at least once.
    """
    tfTable = Counter()
    idfTable = Counter()
    for post in posts:
        kept = [
            token['word']
            for token in post['content']
            if token['flag'] not in excludedFlags
        ]
        tfTable.update(kept)
        # De-duplicate so each post contributes at most 1 per word.
        idfTable.update(set(kept))
    return tfTable, idfTable


if __name__ == "__main__":
    # Fetch the index once; it drives both the progress-bar length and the
    # segmentation call.
    index = data.getIndex()
    progress = ProgressBar(max_value=len(index), redirect_stdout=True)
    tfTable, idfTable = countWordFrequencies(progress(data.posseg(index)))

    # NOTE(review): this is total occurrences divided by document count, not
    # the usual log-scaled TF-IDF, and the result is not written anywhere.
    # Every word in tfTable is also in idfTable with a count >= 1, so the
    # division cannot fail.
    tfidf = {word: tfTable[word] / idfTable[word] for word in tfTable}

    # Show the 100 words that occur in the most posts.
    pprint(idfTable.most_common(100))

    # ensure_ascii=False writes non-ASCII words verbatim, so the file
    # encoding must be pinned instead of depending on the platform locale.
    with open('idfTable.json', 'w', encoding='utf-8') as f:
        json.dump(idfTable, f, ensure_ascii=False, indent=4)
|