加入ratio調整及滑鼠事件

3 years ago · 4ba0a7e901
parent 04d1eb79ba
commit 4ba0a7e901
5 changed files with 134 additions and 104 deletions
--- a/generalText.py
+++ b/generalText.py
@ -2,6 +2,7 @@ import jieba
 import csv
 import nltk
 import re
 import json
 from jieba import posseg
 from nltk import tokenize
 from langdetect import detect
@ -31,38 +32,37 @@ def processText(randId, text, stopwords):
        return ''
    lang = detect(text)
    sentenses = []
    sentenses_raw = []
    print(lang)
    if (lang == 'zh-cn' or lang == 'zh-tw' or lang == 'ko'):
        splitted = re.split('。|[\n]+', text)
        print(splitted)
        cutted = []
-        for i in splitted:
+        for spl in splitted:
-            cutted.append(filterPOS(i))
+            cutted.append(filterPOS(spl))
        print(cutted)
-        for i in cutted:
+        for spl, raw in zip(cutted, splitted):
-            result = []
+            sentenses.append(' '.join(spl))
-            for j in i:
+            sentenses_raw.append(raw)
                if (j in stopwords):
                    continue
                result.append(j)
                if (len(result) >= 20):
                    sentenses.append(' '.join(result.copy()))
                    result = []
            if (result != []):
                sentenses.append(' '.join(result))
    else:
        sentenses = []
-        for sentence in tokenize.sent_tokenize(text):
+        sentenses_raw = []
-            words = sentence.lower().split(' ')
+        for sentence_raw in tokenize.sent_tokenize(text):
            words = sentence_raw.lower().split(' ')
            print([''.join([a for a in w1 if a.isalpha()]) for w1 in words])
            sentence = ' '.join([w for w in [''.join([a for a in w1 if a.isalpha()]) for w1 in words] if w not in [sw.lower() for sw in stopwords]])
            sentenses.append(sentence)
            sentenses_raw.append(sentence_raw)
    result = []
-    result.append(['id', 'text', 'count'])
+    for index, raw_pair in enumerate(zip(sentenses, sentenses_raw)):
-    for index, sentence in enumerate(sentenses):
+        sentence, sentence_raw = raw_pair
-        result.append([index, sentence, 1000])
+        result.append({
-    with open('data/' + randId + '.tsv', 'w', newline='', encoding="utf-8") as f:
+            'id': index,
-        writer = csv.writer(f, delimiter='\t')
+            'text': sentence,
-        writer.writerows(result)
+            'count': 10,
-        f.close()
+            'rawtxt': sentence_raw,
-    return ('data/' + randId + '.tsv')
+        })
    with open('data/' + randId + '.json', 'w', newline='', encoding="utf-8") as fp:
        json.dump(result, fp, ensure_ascii=False, indent=4)
    return ('data/' + randId + '.json')
--- a/static/js/generalText.js
+++ b/static/js/generalText.js
@ -1,6 +1,15 @@
 var tsvPath
 var stopwords = []
 const init = () => {
    // Keep the #nodeTitle element positioned at the pointer: on every
    // mousemove over the window, copy the event's page coordinates into
    // the element's `left` / `top` CSS properties.
    const followPointer = (evt) => {
        const { pageX: left, pageY: top } = evt
        $('#nodeTitle').css({ left, top })
    }
    $(window).on('mousemove', followPointer)
 }
 function clearStopWord() {
    stopwords = []
    $('#sweContainer').html('')
@ -86,22 +95,34 @@ function submit() {
 function buildSentetree() {
    console.log("Build.")
-    var model;
+    let model;
-    var tree;
+    let tree;
-    var data;
+    let data;
-    const graph = d3.tsv(tsvPath, buildTree);
+    const graph = d3.json(tsvPath, buildTree);
    function buildTree(error, rawdata) {
        console.log(rawdata)
        const data = rawdata.map(d => Object.assign({}, d, { count: +d.count }));
        console.log({ data })
        let minRatio = $('#minRatio').val()
        let maxRatio = $('#maxRatio').val()
        console.log({ minRatio, maxRatio })
        model = new SentenTree.SentenTreeBuilder()
            .tokenize(SentenTree.tokenizer.tokenizeBySpace)
            .transformToken(token => (/score(d|s)?/.test(token) ? 'score' : token))
-            .buildModel(data, { maxSupportRatio: 1 });
+            .buildModel(data, { maxSupportRatio: maxRatio, minSupportRatio: minRatio });
        tree = new SentenTree.SentenTreeVis('#vis', {
            fontSize: [15, 40],
            gapBetweenGraph: 10
        });
        tree.data(model.getRenderedGraphs(2))
            .on('nodeMouseenter', (node) => {
                $('#nodeTitle').removeClass('hidden')
                $('#nodeTitleContent').html(node.data.topEntries.map((n) => data[n.id].rawtxt).join('<br>'))
            })
            .on('nodeMouseleave', () => {
                $('#nodeTitle').addClass('hidden')
            })
        new ResizeSensor(jQuery('#d3kitRoot'), function () {
            var scale, origin;
            scale = Math.min(2, ($('#graph').outerWidth()) / ($('#d3kitRoot').outerWidth() + 60))
@ -133,3 +154,5 @@ function countWords() {
    console.log(wordCount)
    $("#wordcount").html('字數：' + wordCount)
 }
 init()
--- a/templates/generalTxt.html
+++ b/templates/generalTxt.html
@ -9,6 +9,9 @@
 </head>
 <body>
    <div id="nodeTitle" class="nodeTitle hidden">
        <div id="nodeTitleContent">test</div>
    </div>
    <div id="stopWordEditorLayer" class="info hidden">
        <div id="stopWordEditor">
            <h4 id="sweTitle" style="margin:10px; display: inline;">編輯停用詞</h4>
@ -39,13 +42,17 @@
    <div id='heading'>
        <h2>泛用文字視覺化工具</h2>
        <p>SentenTree <a href="https://github.com/twitter/SentenTree">https://github.com/twitter/SentenTree</a></p>
-        <p id='comment'>這是泛用.txt檔視覺化工具，能夠簡單處理文字檔的視覺化。</p>
+        <p id='comment'>這是泛用文字視覺化工具，能夠簡單處理文字檔的視覺化。</p>
        <p id='comment'>支援的語言：繁體中文、英文以及所有使用空格分詞的語言。</p>
        <p id='comment'>使用繁體中文Jieba斷詞器，不保證簡體中文能夠正常使用。</p>
    </div>
    <div style="margin:10px;">
        <button class="general-button" type="button" id="editSWButton" style="margin:10px 0px;"
            onclick="showStopwordEditor()">編輯停用詞</button>
        <label>Min Ratio</label>
        <input type="number" step="0.0001" id="minRatio" value="0.001" min="0.0001" max="1">
        <label>Max Ratio</label>
        <input type="number" step="0.0001" id="maxRatio" value="1" min="0.0001" max="1">
    </div>
    <div id='rawText' class=''>
        <div id="wordcount">字數：0</div>