From 4ba0a7e901a6927d19fc5f1da1839f0bc0d0802d Mon Sep 17 00:00:00 2001 From: zovjsra <110753121@nccu.edu.tw> Date: Tue, 9 May 2023 07:01:08 +0000 Subject: [PATCH] =?UTF-8?q?=E5=8A=A0=E5=85=A5ratio=E8=AA=BF=E6=95=B4?= =?UTF-8?q?=E5=8F=8A=E6=BB=91=E9=BC=A0=E4=BA=8B=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- app.py | 4 +- generalText.py | 48 ++++++------- static/js/generalText.js | 35 ++++++++-- static/js/ptt.js | 142 +++++++++++++++++++------------------- templates/generalTxt.html | 9 ++- 5 files changed, 134 insertions(+), 104 deletions(-) diff --git a/app.py b/app.py index 5586bde..b0b9a99 100755 --- a/app.py +++ b/app.py @@ -6,8 +6,8 @@ from pprint import pprint import threading import random import string -#import dataHandlerPTT as ptt -#import dataHandlerPTTPush as pttPush +# import dataHandlerPTT as ptt +# import dataHandlerPTTPush as pttPush import generalText as gen import json diff --git a/generalText.py b/generalText.py index 7c13b61..5038fd4 100644 --- a/generalText.py +++ b/generalText.py @@ -2,6 +2,7 @@ import jieba import csv import nltk import re +import json from jieba import posseg from nltk import tokenize from langdetect import detect @@ -27,42 +28,41 @@ def filterPOS(text): def processText(randId, text, stopwords): - if(text == ''): + if (text == ''): return '' lang = detect(text) sentenses = [] + sentenses_raw = [] print(lang) if (lang == 'zh-cn' or lang == 'zh-tw' or lang == 'ko'): splitted = re.split('。|[\n]+', text) print(splitted) cutted = [] - for i in splitted: - cutted.append(filterPOS(i)) + for spl in splitted: + cutted.append(filterPOS(spl)) print(cutted) - for i in cutted: - result = [] - for j in i: - if (j in stopwords): - continue - result.append(j) - if (len(result) >= 20): - sentenses.append(' '.join(result.copy())) - result = [] - if (result != []): - sentenses.append(' '.join(result)) + for spl, raw in zip(cutted, splitted): + sentenses.append(' '.join(spl)) + sentenses_raw.append(raw) else: sentenses = [] - for sentence in tokenize.sent_tokenize(text): - words = sentence.lower().split(' ') + sentenses_raw = [] + for sentence_raw in tokenize.sent_tokenize(text): + words = sentence_raw.lower().split(' ') print([''.join([a for a in w1 if a.isalpha()]) for w1 in words]) sentence = ' '.join([w for w in [''.join([a for a in w1 if a.isalpha()]) for w1 in words] if w not in [sw.lower() for sw in stopwords]]) sentenses.append(sentence) + sentenses_raw.append(sentence_raw) result = [] - result.append(['id', 'text', 'count']) - for index, sentence in enumerate(sentenses): - result.append([index, sentence, 1000]) - with open('data/' + randId + '.tsv', 'w', newline='', encoding="utf-8") as f: - writer = csv.writer(f, delimiter='\t') - writer.writerows(result) - f.close() - return ('data/' + randId + '.tsv') + for index, raw_pair in enumerate(zip(sentenses, sentenses_raw)): + sentence, sentence_raw = raw_pair + result.append({ + 'id': index, + 'text': sentence, + 'count': 10, + 'rawtxt': sentence_raw, + }) + + with open('data/' + randId + '.json', 'w', newline='', encoding="utf-8") as fp: + json.dump(result, fp, ensure_ascii=False, indent=4) + return ('data/' + randId + '.json') diff --git a/static/js/generalText.js b/static/js/generalText.js index 6d6de75..2ccc48e 100644 --- a/static/js/generalText.js +++ b/static/js/generalText.js @@ -1,6 +1,15 @@ var tsvPath var stopwords = [] +const init = () => { + $(window).on('mousemove', function (e) { + $('#nodeTitle').css({ + left: e.pageX, + top: e.pageY + }) + }) +} + function clearStopWord() { stopwords = [] $('#sweContainer').html('') @@ -86,22 +95,34 @@ function submit() { function buildSentetree() { console.log("Build.") - var model; - var tree; - var data; - const graph = d3.tsv(tsvPath, buildTree); + let model; + let tree; + let data; + const graph = d3.json(tsvPath, buildTree); function buildTree(error, rawdata) { + console.log(rawdata) const data = rawdata.map(d => Object.assign({}, d, { count: +d.count })); + console.log({ data }) + let minRatio = $('#minRatio').val() + let maxRatio = $('#maxRatio').val() + console.log({ minRatio, maxRatio }) model = new SentenTree.SentenTreeBuilder() .tokenize(SentenTree.tokenizer.tokenizeBySpace) .transformToken(token => (/score(d|s)?/.test(token) ? 'score' : token)) - .buildModel(data, { maxSupportRatio: 1 }); + .buildModel(data, { maxSupportRatio: maxRatio, minSupportRatio: minRatio }); tree = new SentenTree.SentenTreeVis('#vis', { fontSize: [15, 40], gapBetweenGraph: 10 }); tree.data(model.getRenderedGraphs(2)) + .on('nodeMouseenter', (node) => { + $('#nodeTitle').removeClass('hidden') + $('#nodeTitleContent').html(node.data.topEntries.map((n) => data[n.id].rawtxt).join('
')) + }) + .on('nodeMouseleave', () => { + $('#nodeTitle').addClass('hidden') + }) new ResizeSensor(jQuery('#d3kitRoot'), function () { var scale, origin; scale = Math.min(2, ($('#graph').outerWidth()) / ($('#d3kitRoot').outerWidth() + 60)) @@ -132,4 +153,6 @@ function countWords() { let wordCount = text.split(new RegExp("[\u4e00-\u9fa5]")).length - 1 console.log(wordCount) $("#wordcount").html('字數:' + wordCount) -} \ No newline at end of file +} + +init() \ No newline at end of file diff --git a/static/js/ptt.js b/static/js/ptt.js index 235a60d..1ef541c 100755 --- a/static/js/ptt.js +++ b/static/js/ptt.js @@ -17,7 +17,7 @@ function init() { type: 'POST', url: '/init', dataType: 'json', - success: function(data) { + success: function (data) { console.log(data) setDate(data.Result.startDate, data.Result.endDate) document.getElementById('keywordBox').value = data.Result.keyword @@ -39,44 +39,44 @@ function init() { buildSentetree() } }) - $(document).ready(function() { - $(window).keydown(function(event) { + $(document).ready(function () { + $(window).keydown(function (event) { if (event.keyCode == 13) { event.preventDefault() sendRequest() } }); }); - $(window).on('mousemove', function(e) { + $(window).on('mousemove', function (e) { $('#nodeTitle').css({ left: e.pageX, top: e.pageY }) }) $('#titleListContainer').hover( - function() { // Run on hover/mouseenter + function () { // Run on hover/mouseenter $(this).css('overflow', 'auto') }, - function() { // Run on mouseleave + function () { // Run on mouseleave $(this).css('overflow', 'hidden') } ) - $('#titleListLayer').click(function(e) { + $('#titleListLayer').click(function (e) { if ($('#titleListLayer').is(e.target)) { hideTitles() } }) - $('#stopWordEditorLayer').click(function(e) { + $('#stopWordEditorLayer').click(function (e) { if ($('#stopWordEditorLayer').is(e.target)) { hideStopWordEditor() } }) - $('#idfEditorLayer').click(function(e) { + $('#idfEditorLayer').click(function (e) { if ($('#idfEditorLayer').is(e.target)) { hideIdfEditor() } }) - $('#pttPageWindow').click(function(e) { + $('#pttPageWindow').click(function (e) { if ($('#pttPageWindow').is(e.target)) { hidePTTPage() } @@ -88,23 +88,23 @@ function init() { function loadTemplate(num) { templates = [{ - startDate: '2020-12-01', - endDate: '2020-12-31', - keyword: '', - mode: 1 - }, - { - startDate: '2020-01-01', - endDate: '2020-03-01', - keyword: '衛生紙', - mode: 2 - }, - { - startDate: '2020-01-11', - endDate: '2020-01-12', - keyword: '', - mode: 2 - } + startDate: '2020-12-01', + endDate: '2020-12-31', + keyword: '', + mode: 1 + }, + { + startDate: '2020-01-01', + endDate: '2020-03-01', + keyword: '衛生紙', + mode: 2 + }, + { + startDate: '2020-01-11', + endDate: '2020-01-12', + keyword: '', + mode: 2 + } ] chosenTemp = templates[num] setDate(chosenTemp.startDate, chosenTemp.endDate) @@ -127,7 +127,7 @@ function addStopWord() { } else { stopwords.push(newsw) - $('#sweContainer').append($('
  • ').attr('class', 'w3-display-container').append($('').append(newsw)).append($('').attr('class', 'w3-button w3-hover-red w3-transparent w3-display-right').click(function(e) { + $('#sweContainer').append($('
  • ').attr('class', 'w3-display-container').append($('').append(newsw)).append($('').attr('class', 'w3-button w3-hover-red w3-transparent w3-display-right').click(function (e) { var index = $(this).parent().index() console.log(stopwords[index]) stopwords.splice(index, 1) @@ -172,14 +172,14 @@ function scrollIdfList() { function showStopwordEditor() { $(window).unbind('keydown') - $(window).keydown(function(event) { + $(window).keydown(function (event) { if (event.keyCode == 13) { addStopWord() } }) $('#sweContainer').empty() for (word of stopwords) { - $('#sweContainer').append($('
  • ').append($('').append(word)).append($('').attr('class', 'w3-button w3-hover-red w3-transparent w3-display-right').click(function(e) { + $('#sweContainer').append($('
  • ').append($('').append(word)).append($('').attr('class', 'w3-button w3-hover-red w3-transparent w3-display-right').click(function (e) { var index = $(this).parent().index() console.log(stopwords[index]) stopwords.splice(index, 1) @@ -192,27 +192,27 @@ function showStopwordEditor() { function showIdfEditor() { $(window).unbind('keydown') - $(window).keydown(function(event) { + $(window).keydown(function (event) { if (event.keyCode == 13) { scrollIdfList() } }) $('#ieContainer').empty().append( - $('').append($('') - .append($('') - .attr('style', 'position: sticky; top: 0; background: white;') - .append('單詞')) - .append($('') - .attr('class', 'w3-center-align') - .attr('style', 'position: sticky; top: 0; background: white;') - .append('操作')) - .append($('') - .attr('class', 'w3-right-align') - .attr('style', 'position: sticky; top: 0; background: white;') - .append('單詞頻率') - ) + $('').append($('') + .append($('') + .attr('style', 'position: sticky; top: 0; background: white;') + .append('單詞')) + .append($('') + .attr('class', 'w3-center-align') + .attr('style', 'position: sticky; top: 0; background: white;') + .append('操作')) + .append($('') + .attr('class', 'w3-right-align') + .attr('style', 'position: sticky; top: 0; background: white;') + .append('單詞頻率') ) ) + ) .append($('')) for (word of Object.entries(idfTable).sort((a, b) => { return (b[1] - a[1]) }).map((a) => { return a[0] }).slice(0, 1000)) { $('#ieContainer').find('tbody') @@ -226,7 +226,7 @@ function showIdfEditor() { .append($(' + + + +
    字數:0