From 4ba0a7e901a6927d19fc5f1da1839f0bc0d0802d Mon Sep 17 00:00:00 2001
From: zovjsra <110753121@nccu.edu.tw>
Date: Tue, 9 May 2023 07:01:08 +0000
Subject: [PATCH] =?UTF-8?q?=E5=8A=A0=E5=85=A5ratio=E8=AA=BF=E6=95=B4?=
 =?UTF-8?q?=E5=8F=8A=E6=BB=91=E9=BC=A0=E4=BA=8B=E4=BB=B6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 app.py                    |   4 +-
 generalText.py            |  48 ++++++-------
 static/js/generalText.js  |  35 ++++++++--
 static/js/ptt.js          | 142 +++++++++++++++++++-------------------
 templates/generalTxt.html |   9 ++-
 5 files changed, 134 insertions(+), 104 deletions(-)

diff --git a/app.py b/app.py
index 5586bde..b0b9a99 100755
--- a/app.py
+++ b/app.py
@@ -6,8 +6,8 @@ from pprint import pprint
 import threading
 import random
 import string
-#import dataHandlerPTT as ptt
-#import dataHandlerPTTPush as pttPush
+# import dataHandlerPTT as ptt
+# import dataHandlerPTTPush as pttPush
 import generalText as gen
 import json
 
diff --git a/generalText.py b/generalText.py
index 7c13b61..5038fd4 100644
--- a/generalText.py
+++ b/generalText.py
@@ -2,6 +2,7 @@ import jieba
 import csv
 import nltk
 import re
+import json
 from jieba import posseg
 from nltk import tokenize
 from langdetect import detect
@@ -27,42 +28,41 @@ def filterPOS(text):
 
 
 def processText(randId, text, stopwords):
-    if(text == ''):
+    if (text == ''):
         return ''
     lang = detect(text)
     sentenses = []
+    sentenses_raw = []
     print(lang)
     if (lang == 'zh-cn' or lang == 'zh-tw' or lang == 'ko'):
         splitted = re.split('。|[\n]+', text)
         print(splitted)
         cutted = []
-        for i in splitted:
-            cutted.append(filterPOS(i))
+        for spl in splitted:
+            cutted.append(filterPOS(spl))
         print(cutted)
-        for i in cutted:
-            result = []
-            for j in i:
-                if (j in stopwords):
-                    continue
-                result.append(j)
-                if (len(result) >= 20):
-                    sentenses.append(' '.join(result.copy()))
-                    result = []
-            if (result != []):
-                sentenses.append(' '.join(result))
+        for spl, raw in zip(cutted, splitted):
+            sentenses.append(' '.join(w for w in spl if w not in stopwords))  # still honour user stop words, as the non-CJK branch below does
+            sentenses_raw.append(raw)
     else:
         sentenses = []
-        for sentence in tokenize.sent_tokenize(text):
-            words = sentence.lower().split(' ')
+        sentenses_raw = []
+        for sentence_raw in tokenize.sent_tokenize(text):
+            words = sentence_raw.lower().split(' ')
             print([''.join([a for a in w1 if a.isalpha()]) for w1 in words])
             sentence = ' '.join([w for w in [''.join([a for a in w1 if a.isalpha()]) for w1 in words] if w not in [sw.lower() for sw in stopwords]])
             sentenses.append(sentence)
+            sentenses_raw.append(sentence_raw)
     result = []
-    result.append(['id', 'text', 'count'])
-    for index, sentence in enumerate(sentenses):
-        result.append([index, sentence, 1000])
-    with open('data/' + randId + '.tsv', 'w', newline='', encoding="utf-8") as f:
-        writer = csv.writer(f, delimiter='\t')
-        writer.writerows(result)
-        f.close()
-    return ('data/' + randId + '.tsv')
+    for index, raw_pair in enumerate(zip(sentenses, sentenses_raw)):
+        sentence, sentence_raw = raw_pair
+        result.append({
+            'id': index,
+            'text': sentence,
+            'count': 10,
+            'rawtxt': sentence_raw,
+        })
+
+    with open('data/' + randId + '.json', 'w', newline='', encoding="utf-8") as fp:
+        json.dump(result, fp, ensure_ascii=False, indent=4)
+    return ('data/' + randId + '.json')
diff --git a/static/js/generalText.js b/static/js/generalText.js
index 6d6de75..2ccc48e 100644
--- a/static/js/generalText.js
+++ b/static/js/generalText.js
@@ -1,6 +1,15 @@
 var tsvPath
 var stopwords = []
 
+const init = () => {
+    $(window).on('mousemove', function (e) {
+        $('#nodeTitle').css({
+            left: e.pageX,
+            top: e.pageY
+        })
+    })
+}
+
 function clearStopWord() {
     stopwords = []
     $('#sweContainer').html('')
@@ -86,22 +95,34 @@ function submit() {
 
 function buildSentetree() {
     console.log("Build.")
-    var model;
-    var tree;
-    var data;
-    const graph = d3.tsv(tsvPath, buildTree);
+    let model;
+    let tree;
+    let data;
+    const graph = d3.json(tsvPath, buildTree);
 
     function buildTree(error, rawdata) {
+        console.log(rawdata)
         const data = rawdata.map(d => Object.assign({}, d, { count: +d.count }));
+        console.log({ data })
+        let minRatio = $('#minRatio').val()
+        let maxRatio = $('#maxRatio').val()
+        console.log({ minRatio, maxRatio })
         model = new SentenTree.SentenTreeBuilder()
             .tokenize(SentenTree.tokenizer.tokenizeBySpace)
             .transformToken(token => (/score(d|s)?/.test(token) ? 'score' : token))
-            .buildModel(data, { maxSupportRatio: 1 });
+            .buildModel(data, { maxSupportRatio: maxRatio, minSupportRatio: minRatio });
         tree = new SentenTree.SentenTreeVis('#vis', {
             fontSize: [15, 40],
             gapBetweenGraph: 10
         });
         tree.data(model.getRenderedGraphs(2))
+            .on('nodeMouseenter', (node) => {
+                $('#nodeTitle').removeClass('hidden')
+                $('#nodeTitleContent').html(node.data.topEntries.map((n) => data[n.id].rawtxt).join('<br>'))
+            })
+            .on('nodeMouseleave', () => {
+                $('#nodeTitle').addClass('hidden')
+            })
         new ResizeSensor(jQuery('#d3kitRoot'), function () {
             var scale, origin;
             scale = Math.min(2, ($('#graph').outerWidth()) / ($('#d3kitRoot').outerWidth() + 60))
@@ -132,4 +153,6 @@ function countWords() {
     let wordCount = text.split(new RegExp("[\u4e00-\u9fa5]")).length - 1
     console.log(wordCount)
     $("#wordcount").html('字數：' + wordCount)
-}
\ No newline at end of file
+}
+
+init()
\ No newline at end of file
diff --git a/static/js/ptt.js b/static/js/ptt.js
index 235a60d..1ef541c 100755
--- a/static/js/ptt.js
+++ b/static/js/ptt.js
@@ -17,7 +17,7 @@ function init() {
         type: 'POST',
         url: '/init',
         dataType: 'json',
-        success: function(data) {
+        success: function (data) {
             console.log(data)
             setDate(data.Result.startDate, data.Result.endDate)
             document.getElementById('keywordBox').value = data.Result.keyword
@@ -39,44 +39,44 @@ function init() {
             buildSentetree()
         }
     })
-    $(document).ready(function() {
-        $(window).keydown(function(event) {
+    $(document).ready(function () {
+        $(window).keydown(function (event) {
             if (event.keyCode == 13) {
                 event.preventDefault()
                 sendRequest()
             }
         });
     });
-    $(window).on('mousemove', function(e) {
+    $(window).on('mousemove', function (e) {
         $('#nodeTitle').css({
             left: e.pageX,
             top: e.pageY
         })
     })
     $('#titleListContainer').hover(
-        function() { // Run on hover/mouseenter
+        function () { // Run on hover/mouseenter
             $(this).css('overflow', 'auto')
         },
-        function() { // Run on mouseleave
+        function () { // Run on mouseleave
             $(this).css('overflow', 'hidden')
         }
     )
-    $('#titleListLayer').click(function(e) {
+    $('#titleListLayer').click(function (e) {
         if ($('#titleListLayer').is(e.target)) {
             hideTitles()
         }
     })
-    $('#stopWordEditorLayer').click(function(e) {
+    $('#stopWordEditorLayer').click(function (e) {
         if ($('#stopWordEditorLayer').is(e.target)) {
             hideStopWordEditor()
         }
     })
-    $('#idfEditorLayer').click(function(e) {
+    $('#idfEditorLayer').click(function (e) {
         if ($('#idfEditorLayer').is(e.target)) {
             hideIdfEditor()
         }
     })
-    $('#pttPageWindow').click(function(e) {
+    $('#pttPageWindow').click(function (e) {
         if ($('#pttPageWindow').is(e.target)) {
             hidePTTPage()
         }
@@ -88,23 +88,23 @@ function init() {
 
 function loadTemplate(num) {
     templates = [{
-            startDate: '2020-12-01',
-            endDate: '2020-12-31',
-            keyword: '',
-            mode: 1
-        },
-        {
-            startDate: '2020-01-01',
-            endDate: '2020-03-01',
-            keyword: '衛生紙',
-            mode: 2
-        },
-        {
-            startDate: '2020-01-11',
-            endDate: '2020-01-12',
-            keyword: '',
-            mode: 2
-        }
+        startDate: '2020-12-01',
+        endDate: '2020-12-31',
+        keyword: '',
+        mode: 1
+    },
+    {
+        startDate: '2020-01-01',
+        endDate: '2020-03-01',
+        keyword: '衛生紙',
+        mode: 2
+    },
+    {
+        startDate: '2020-01-11',
+        endDate: '2020-01-12',
+        keyword: '',
+        mode: 2
+    }
     ]
     chosenTemp = templates[num]
     setDate(chosenTemp.startDate, chosenTemp.endDate)
@@ -127,7 +127,7 @@ function addStopWord() {
 
             } else {
                 stopwords.push(newsw)
-                $('#sweContainer').append($('<li>').attr('class', 'w3-display-container').append($('<span>').append(newsw)).append($('<span>').attr('class', 'w3-button w3-hover-red w3-transparent w3-display-right').click(function(e) {
+                $('#sweContainer').append($('<li>').attr('class', 'w3-display-container').append($('<span>').append(newsw)).append($('<span>').attr('class', 'w3-button w3-hover-red w3-transparent w3-display-right').click(function (e) {
                     var index = $(this).parent().index()
                     console.log(stopwords[index])
                     stopwords.splice(index, 1)
@@ -172,14 +172,14 @@ function scrollIdfList() {
 
 function showStopwordEditor() {
     $(window).unbind('keydown')
-    $(window).keydown(function(event) {
+    $(window).keydown(function (event) {
         if (event.keyCode == 13) {
             addStopWord()
         }
     })
     $('#sweContainer').empty()
     for (word of stopwords) {
-        $('#sweContainer').append($('<li>').append($('<span>').append(word)).append($('<span>').attr('class', 'w3-button w3-hover-red w3-transparent w3-display-right').click(function(e) {
+        $('#sweContainer').append($('<li>').append($('<span>').append(word)).append($('<span>').attr('class', 'w3-button w3-hover-red w3-transparent w3-display-right').click(function (e) {
             var index = $(this).parent().index()
             console.log(stopwords[index])
             stopwords.splice(index, 1)
@@ -192,27 +192,27 @@ function showStopwordEditor() {
 
 function showIdfEditor() {
     $(window).unbind('keydown')
-    $(window).keydown(function(event) {
+    $(window).keydown(function (event) {
         if (event.keyCode == 13) {
             scrollIdfList()
         }
     })
     $('#ieContainer').empty().append(
-            $('<thead>').append($('<tr>')
-                .append($('<th>')
-                    .attr('style', 'position: sticky; top: 0; background: white;')
-                    .append('單詞'))
-                .append($('<th>')
-                    .attr('class', 'w3-center-align')
-                    .attr('style', 'position: sticky; top: 0; background: white;')
-                    .append('操作'))
-                .append($('<th>')
-                    .attr('class', 'w3-right-align')
-                    .attr('style', 'position: sticky; top: 0; background: white;')
-                    .append('單詞頻率')
-                )
+        $('<thead>').append($('<tr>')
+            .append($('<th>')
+                .attr('style', 'position: sticky; top: 0; background: white;')
+                .append('單詞'))
+            .append($('<th>')
+                .attr('class', 'w3-center-align')
+                .attr('style', 'position: sticky; top: 0; background: white;')
+                .append('操作'))
+            .append($('<th>')
+                .attr('class', 'w3-right-align')
+                .attr('style', 'position: sticky; top: 0; background: white;')
+                .append('單詞頻率')
             )
         )
+    )
         .append($('<tbody>'))
     for (word of Object.entries(idfTable).sort((a, b) => { return (b[1] - a[1]) }).map((a) => { return a[0] }).slice(0, 1000)) {
         $('#ieContainer').find('tbody')
@@ -226,7 +226,7 @@ function showIdfEditor() {
                     .append($('<button>')
                         .attr('class', 'general-button')
                         .html('設為最小')
-                        .click(function() {
+                        .click(function () {
                             $(this)
                                 .parent()
                                 .parent()
@@ -236,7 +236,7 @@ function showIdfEditor() {
                     .append($('<button>')
                         .attr('class', 'general-button')
                         .html('設為最大')
-                        .click(function() {
+                        .click(function () {
                             $(this)
                                 .parent()
                                 .parent()
@@ -248,7 +248,7 @@ function showIdfEditor() {
                     .append($('<button>')
                         .attr('class', 'general-button')
                         .html('重設')
-                        .click(function() {
+                        .click(function () {
                             var _word = $($(this)
                                 .parent()
                                 .parent()
@@ -275,7 +275,7 @@ function showIdfEditor() {
 
 function hideStopWordEditor() {
     $(window).unbind('keydown')
-    $(window).keydown(function(event) {
+    $(window).keydown(function (event) {
         if (event.keyCode == 13) {
             event.preventDefault()
             sendRequest()
@@ -286,7 +286,7 @@ function hideStopWordEditor() {
 
 function hideIdfEditor() {
     $(window).unbind('keydown')
-    $(window).keydown(function(event) {
+    $(window).keydown(function (event) {
         if (event.keyCode == 13) {
             event.preventDefault()
             sendRequest()
@@ -369,7 +369,7 @@ function sendRequest() {
         url: '/addRequest',
         data: content,
         contentType: 'application/json',
-        success: function(data) {
+        success: function (data) {
             console.log(data)
             changeGraph(data.Result)
         }
@@ -460,16 +460,16 @@ function buildSentetree() {
                         })
                     })
                 } else {
-                    seqList = node.data.seq.DBs.map(function(n) {
+                    seqList = node.data.seq.DBs.map(function (n) {
                         return n.rawText
                     })
                 }
                 titleList = []
                 for (s of seqList) {
                     titleTemp = wordTitleList[s]
-                    if ((titleList.map(function(n) {
-                            return n.title
-                        })).indexOf(titleTemp.title) == -1) {
+                    if ((titleList.map(function (n) {
+                        return n.title
+                    })).indexOf(titleTemp.title) == -1) {
                         titleList.push(titleTemp)
                     }
                 }
@@ -508,7 +508,7 @@ function buildSentetree() {
                         globKeyword: globKeyword
                     }),
                     contentType: 'application/json',
-                    success: function(data) {
+                    success: function (data) {
                         console.log(data)
                         $('#titleListKeywordInfo').html('單詞出現次數：' + data.Result.wordCount + ', 單詞出現的文章數：' + data.Result.postCount + ', 單詞頻率：' + (data.Result.postCount * 100 / totalPosts).toFixed(2) + '%')
                     }
@@ -517,26 +517,26 @@ function buildSentetree() {
                 for (i of titleList) {
                     $('#titleListContainer').append(
                         $('<li>').attr('class', 'w3-panel')
-                        .css('cursor', 'pointer').append(
-                            $('<p>').attr('target', '_blank').append(
-                                $('<h4>').html(i.title)
-                            ).append(
-                                $('<span>').attr('style', 'margin: 0px 10px').html(i.author)
-                            ).append(
-                                $('<span>').attr('style', 'margin: 0px 10px').html(i.date)
-                            ).append(
-                                $('<span>').attr('style', 'margin: 0px 10px').html('推文數：' + i.pushes)
-                            )
-                        ).click(function() {
-                            let indx = $(this).index()
-                            showPTTPage((titleList[indx].url).replace('www.ptt.cc', 'www.pttweb.cc'))
-                        })
+                            .css('cursor', 'pointer').append(
+                                $('<p>').attr('target', '_blank').append(
+                                    $('<h4>').html(i.title)
+                                ).append(
+                                    $('<span>').attr('style', 'margin: 0px 10px').html(i.author)
+                                ).append(
+                                    $('<span>').attr('style', 'margin: 0px 10px').html(i.date)
+                                ).append(
+                                    $('<span>').attr('style', 'margin: 0px 10px').html('推文數：' + i.pushes)
+                                )
+                            ).click(function () {
+                                let indx = $(this).index()
+                                showPTTPage((titleList[indx].url).replace('www.ptt.cc', 'www.pttweb.cc'))
+                            })
                     )
                 }
             })
             .on('nodeMouseenter', node => {
                 console.log(node)
-                titles = node.data.topEntries.map(function(x) {
+                titles = node.data.topEntries.map(function (x) {
                     return wordTitleList[x.rawText]
                 })
                 console.log(titles)
@@ -564,7 +564,7 @@ function buildSentetree() {
             }).on('linkMouseenter', link => {
                 console.log(link)
             })
-        new ResizeSensor(jQuery('#d3kitRoot'), function() {
+        new ResizeSensor(jQuery('#d3kitRoot'), function () {
             var scale, origin;
             scale = Math.min(2, ($('#graph').outerWidth()) / ($('#d3kitRoot').outerWidth() + 60))
 
diff --git a/templates/generalTxt.html b/templates/generalTxt.html
index e0c2c24..dc4f66b 100644
--- a/templates/generalTxt.html
+++ b/templates/generalTxt.html
@@ -9,6 +9,9 @@
 </head>
 
 <body>
+    <div id="nodeTitle" class="nodeTitle hidden">
+        <div id="nodeTitleContent">test</div>
+    </div>
     <div id="stopWordEditorLayer" class="info hidden">
         <div id="stopWordEditor">
             <h4 id="sweTitle" style="margin:10px; display: inline;">編輯停用詞</h4>
@@ -39,13 +42,17 @@
     <div id='heading'>
         <h2>泛用文字視覺化工具</h2>
         <p>SentenTree <a href="https://github.com/twitter/SentenTree">https://github.com/twitter/SentenTree</a></p>
-        <p id='comment'>這是泛用.txt檔視覺化工具，能夠簡單處理文字檔的視覺化。</p>
+        <p id='comment'>這是泛用文字視覺化工具，能夠簡單處理文字檔的視覺化。</p>
         <p id='comment'>支援的語言：繁體中文、英文以及所有使用空格分詞的語言。</p>
         <p id='comment'>使用繁體中文Jieba斷詞器，不保證簡體中文能夠正常使用。</p>
     </div>
     <div style="margin:10px;">
         <button class="general-button" type="button" id="editSWButton" style="margin:10px 0px;"
             onclick="showStopwordEditor()">編輯停用詞</button>
+        <label>Min Ratio</label>
+        <input type="number" step="0.0001" id="minRatio" value="0.001" min="0.0001" max="1">
+        <label>Max Ratio</label>
+        <input type="number" step="0.0001" id="maxRatio" value="1" min="0.0001" max="1">
     </div>
     <div id='rawText' class=''>
         <div id="wordcount">字數：0</div>