You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
ptt-sententree/tfIdfTest.py

40 lines
1.2 KiB

from progressbar import ProgressBar
from PTTData import PTTData
from pprint import pprint
import multiprocess.pool as mp
import json
import _pickle as pickle
# PTTData is a project class; the arguments are presumably the board name and
# the root directory of the local PTT database -- confirm against PTTData.
data = PTTData('Gossiping', '/home/vis/pttDatabase/PTTData')

# Load the pickled post contents from the local database directory.
# NOTE(security): pickle.load can execute arbitrary code; only use it on
# files this project produced itself, never on untrusted input.
# The `with` statement closes the file, so no explicit f.close() is needed.
with open('/home/vis/pttDatabase/PTTData/Gossiping/content/content.pck', 'rb') as f:
    postContents = pickle.load(f)
def count_frequencies(posts, excluded_flags=('eng', 'x', 'm')):
    """Count term and document frequencies over segmented posts.

    Args:
        posts: Iterable of mappings, each with a ``'content'`` entry that is
            an iterable of tokens. Every token is a mapping with a ``'word'``
            and a ``'flag'`` entry.
        excluded_flags: Flags whose tokens are skipped entirely. The default
            reproduces the original filter (presumably part-of-speech tags
            for English text, punctuation and numerals -- confirm against
            the segmenter in use).

    Returns:
        A ``(term_freq, doc_freq)`` tuple of dicts keyed by word.
        ``term_freq[word]`` is the total number of occurrences across all
        posts; ``doc_freq[word]`` is the number of posts that contain the
        word at least once. Keys appear in first-seen order, so the result
        is deterministic for a given input.
    """
    term_freq = {}
    doc_freq = {}
    for post in posts:
        # Words already counted towards doc_freq for this post.
        seen_in_post = set()
        for token in post['content']:
            if token['flag'] in excluded_flags:
                continue
            word = token['word']
            term_freq[word] = term_freq.get(word, 0) + 1
            if word not in seen_in_post:
                seen_in_post.add(word)
                doc_freq[word] = doc_freq.get(word, 0) + 1
    return term_freq, doc_freq


if __name__ == "__main__":
    # Wrap the segmented-post iterator in a progress bar sized by the index.
    progress = ProgressBar(max_value=len(data.getIndex()), redirect_stdout=True)
    tfTable, idfTable = count_frequencies(progress(data.posseg(data.getIndex())))

    # NOTE(review): despite its name this is term frequency divided by
    # document frequency (average occurrences per containing post), not the
    # classic tf * log(N / df) weighting. Kept as-is; it is not written out.
    tfidf = {word: tfTable[word] / idfTable[word] for word in tfTable}

    # Show the 100 words that occur in the most posts.
    pprint(sorted(idfTable.items(), key=lambda x: x[1], reverse=True)[:100])

    # ensure_ascii=False writes raw non-ASCII characters, so the file encoding
    # must be fixed explicitly instead of depending on the system locale.
    with open('idfTable.json', 'w', encoding='utf-8') as f:
        json.dump(idfTable, f, ensure_ascii=False, indent=4)