"""Count word frequencies over the PTT 'Gossiping' corpus and dump them as JSON.

When run as a script this module:

1. iterates over the part-of-speech-tagged posts yielded by
   ``data.posseg(data.getIndex())``,
2. counts, for every kept word, its total number of occurrences
   (``tfTable``) and the number of posts it appears in (``idfTable``),
3. pretty-prints the 100 words that appear in the most posts, and
4. writes ``idfTable`` to ``idfTable.json`` in the current directory.

Importing the module (without running it) still constructs ``data`` and loads
``postContents`` from disk, exactly as before.
"""
import json
import _pickle as pickle
from pprint import pprint

import multiprocess.pool as mp
from progressbar import ProgressBar

from PTTData import PTTData

# Tokens carrying one of these flags are ignored when counting.
# NOTE(review): presumably jieba POS tags (English / punctuation / numeral)
# -- confirm against PTTData.posseg.
_SKIPPED_FLAGS = ('eng', 'x', 'm')


def _count_terms(posts):
    """Return ``(term_counts, doc_counts)`` for an iterable of tagged posts.

    Args:
        posts: iterable of mappings whose ``'content'`` entry is an iterable
            of tokens; each token is a mapping with ``'word'`` and ``'flag'``.

    Returns:
        A pair of plain dicts keyed by word:
        ``term_counts[word]`` is the total number of occurrences over all
        posts, and ``doc_counts[word]`` is the number of posts that contain
        the word at least once. Both dicts share the same key set.
    """
    term_counts = {}
    doc_counts = {}
    for post in posts:
        # Distinct kept words of this post, so each post contributes at most
        # one to a word's document count.
        # NOTE(review): the original file's indentation was lost; this assumes
        # only words passing the flag filter count toward doc_counts -- confirm.
        seen = set()
        for token in post['content']:
            if token['flag'] in _SKIPPED_FLAGS:
                continue
            word = token['word']
            term_counts[word] = term_counts.get(word, 0) + 1
            seen.add(word)
        for word in seen:
            doc_counts[word] = doc_counts.get(word, 0) + 1
    return term_counts, doc_counts


data = PTTData('Gossiping', '/home/vis/pttDatabase/PTTData')

# NOTE: unpickling executes arbitrary code embedded in the file; only load
# content.pck from a trusted location.
with open('/home/vis/pttDatabase/PTTData/Gossiping/content/content.pck', 'rb') as f:
    postContents = pickle.load(f)

if __name__ == "__main__":
    index = data.getIndex()
    progress = ProgressBar(max_value=len(index), redirect_stdout=True)

    # Despite its name, idfTable holds raw document frequencies (number of
    # posts containing the word), not inverse document frequencies.
    tfTable, idfTable = _count_terms(progress(data.posseg(index)))

    # Ratio of total occurrences to number of containing posts.
    # NOTE(review): this value is computed but never printed or saved, and it
    # is not the usual tf * log(N / df) weighting -- confirm the intent.
    tfidf = {w: tfTable[w] / idfTable[w] for w in tfTable}

    # Show the 100 words that occur in the most posts.
    pprint(sorted(idfTable.items(), key=lambda x: x[1], reverse=True)[:100])

    # ensure_ascii=False writes non-ASCII words verbatim, so the file encoding
    # is pinned to UTF-8 instead of depending on the platform locale.
    with open('idfTable.json', 'w', encoding='utf-8') as f:
        json.dump(idfTable, f, ensure_ascii=False, indent=4)