change app.py

加入說明
修改用詞
16 changed files with 266 additions and 134 deletions
--- a/10
+++ b/10
@@ -0,0 +1,10 @@
+FROM python:3.8.16-bullseye
+
+ADD . /app
+
+RUN pip install flask flask-compress jieba numpy progressbar2 nltk langdetect
+
+WORKDIR /app
+RUN mkdir data
+
+ENTRYPOINT [ "/usr/local/bin/python", "app.py" ]
--- a/README.md
+++ b/README.md
@@ -1 +1,3 @@
 # ptt-sententree
+## Reference
+https://github.com/twitter/SentenTree
--- a/pycache/dataHandler.cpython-37.pyc
+++ b/pycache/dataHandler.cpython-37.pyc
--- a/pycache/dataHandler.cpython-38.pyc
+++ b/pycache/dataHandler.cpython-38.pyc
--- a/pycache/dataHandlerPTT.cpython-37.pyc
+++ b/pycache/dataHandlerPTT.cpython-37.pyc
--- a/pycache/generalText.cpython-37.pyc
+++ b/pycache/generalText.cpython-37.pyc
--- a/app.py
+++ b/app.py
@@ -6,8 +6,8 @@ from pprint import pprint
 import threading
 import random
 import string
-import dataHandlerPTT as ptt
-import dataHandlerPTTPush as pttPush
+# import dataHandlerPTT as ptt
+# import dataHandlerPTTPush as pttPush
 import generalText as gen
 import json

@@ -33,9 +33,9 @@ def eventStream(eventQueue):
        yield "event:{event}\n{data}\n\n".format(event=eventNode['event'], data=data)


-@app.route('/data/<path:path>')
+@app.route('/img/<path:path>')
 def send_data(path):
-    return send_from_directory('data', path)
+    return send_from_directory('resource/img', path)


@app.route('/generalTxt')
@@ -57,7 +57,7 @@ def generalText_addText():

@app.route('/')
 def index():
-    return redirect('/ptt')
+    return redirect('/generalTxt')


@app.route('/ptt_push')
@@ -151,10 +151,13 @@ def send_resource(path):
    return send_from_directory('resource', path)


-@app.route("/dcard_dev")
-def dcard_dev():
-    return render_template('dcard.html', title='DCard Sentntree 測試版')
+@app.route('/generaltxt/help')
+def generaltxt_help():
+    return render_template('generaltxt_help.html', title="使用說明")

+@app.route('/data/<path:path>')
+def get_data(path):
+    return send_from_directory('data', path)

 if __name__ == "__main__":
-    app.run(debug=True, port=4998, host='0.0.0.0', threaded=True)
+    app.run(debug=True, port=4980, host='0.0.0.0', threaded=False)
--- a/dataHandlerPTT.py
+++ b/dataHandlerPTT.py
@@ -15,7 +15,7 @@ from numpy import prod
 from jieba import posseg
 from progressbar import ProgressBar
 from datetime import datetime
-from PTTData import PTTData
+#from PTTData import PTTData


 defaultDate = {
@@ -29,7 +29,7 @@ with open('/home/vis/pttDatabase/PTTData/Gossiping/content/content.pck', 'rb') a
    f.close()

 defaultStopWords = []
-data = PTTData('Gossiping', '/home/vis/pttDatabase/PTTData')
+#data = PTTData('Gossiping', '/home/vis/pttDatabase/PTTData')

 sentence_length = 100
 use_push_count = False
--- a/generalText.py
+++ b/generalText.py
@@ -2,6 +2,7 @@ import jieba
 import csv
 import nltk
 import re
+import json
 from jieba import posseg
 from nltk import tokenize
 from langdetect import detect
@@ -31,38 +32,37 @@ def processText(randId, text, stopwords):
        return ''
    lang = detect(text)
    sentenses = []
+    sentenses_raw = []
    print(lang)
    if (lang == 'zh-cn' or lang == 'zh-tw' or lang == 'ko'):
        splitted = re.split('。|[\n]+', text)
        print(splitted)
        cutted = []
-        for i in splitted:
-            cutted.append(filterPOS(i))
+        for spl in splitted:
+            cutted.append(filterPOS(spl))
        print(cutted)
-        for i in cutted:
-            result = []
-            for j in i:
-                if (j in stopwords):
-                    continue
-                result.append(j)
-                if (len(result) >= 20):
-                    sentenses.append(' '.join(result.copy()))
-                    result = []
-            if (result != []):
-                sentenses.append(' '.join(result))
+        for spl, raw in zip(cutted, splitted):
+            sentenses.append(' '.join(spl))
+            sentenses_raw.append(raw)
    else:
        sentenses = []
-        for sentence in tokenize.sent_tokenize(text):
-            words = sentence.lower().split(' ')
+        sentenses_raw = []
+        for sentence_raw in tokenize.sent_tokenize(text):
+            words = sentence_raw.lower().split(' ')
            print([''.join([a for a in w1 if a.isalpha()]) for w1 in words])
            sentence = ' '.join([w for w in [''.join([a for a in w1 if a.isalpha()]) for w1 in words] if w not in [sw.lower() for sw in stopwords]])
            sentenses.append(sentence)
+            sentenses_raw.append(sentence_raw)
    result = []
-    result.append(['id', 'text', 'count'])
-    for index, sentence in enumerate(sentenses):
-        result.append([index, sentence, 1000])
-    with open('data/' + randId + '.tsv', 'w', newline='', encoding="utf-8") as f:
-        writer = csv.writer(f, delimiter='\t')
-        writer.writerows(result)
-        f.close()
-    return ('data/' + randId + '.tsv')
+    for index, raw_pair in enumerate(zip(sentenses, sentenses_raw)):
+        sentence, sentence_raw = raw_pair
+        result.append({
+            'id': index,
+            'text': sentence,
+            'count': 10,
+            'rawtxt': sentence_raw,
+        })
+
+    with open('data/' + randId + '.json', 'w', newline='', encoding="utf-8") as fp:
+        json.dump(result, fp, ensure_ascii=False, indent=4)
+    return ('data/' + randId + '.json')
--- a/resource/img/general_txt_help_g01.png
+++ b/resource/img/general_txt_help_g01.png
--- a/resource/img/general_txt_help_g02.png
+++ b/resource/img/general_txt_help_g02.png
--- a/static/css/main.css
+++ b/static/css/main.css
@@ -186,6 +186,7 @@ li a {
    from {
        opacity: 0;
    }
+
    to {
        opacity: 1;
    }
@@ -202,6 +203,7 @@ li a {
    from {
        opacity: 1;
    }
+
    to {
        opacity: 0;
    }
@@ -260,7 +262,7 @@ li a {

 #vis {
    display: inline-block;
-    background-color: aliceblue;
+    background-color: transparent;
    position: relative;
    border-radius: 30px;
    resize: both;
--- a/static/js/generalText.js
+++ b/static/js/generalText.js
@@ -1,6 +1,33 @@
 var tsvPath
 var stopwords = []

+const init = () => {
+    $(window).on('mousemove', (e) => {
+        $('#nodeTitle').css({
+            left: e.pageX,
+            top: e.pageY
+        })
+    })
+    $('#minRatioLabel').on('mouseenter', () => {
+        $('#nodeTitle').removeClass('hidden')
+        $('#nodeTitleContent').html('兩個相鄰單詞之間出現頻率比值的最小值，小於該值不會被演算法選擇')
+    }).on('mouseleave', () => {
+        $('#nodeTitle').toggleClass('hidden')
+    })
+    $('#maxRatioLabel').on('mouseenter', () => {
+        $('#nodeTitle').removeClass('hidden')
+        $('#nodeTitleContent').html('兩個相鄰單詞之間出現頻率比值的最大值，大於該值不會被演算法選擇')
+    }).on('mouseleave', () => {
+        $('#nodeTitle').toggleClass('hidden')
+    })
+    $('#wordcount').on('mouseenter', () => {
+        $('#nodeTitle').removeClass('hidden')
+        $('#nodeTitleContent').html('僅計算中文字的字數')
+    }).on('mouseleave', () => {
+        $('#nodeTitle').toggleClass('hidden')
+    })
+}
+
 function clearStopWord() {
    stopwords = []
    $('#sweContainer').html('')
@@ -86,22 +113,41 @@ function submit() {

 function buildSentetree() {
    console.log("Build.")
-    var model;
-    var tree;
-    var data;
-    const graph = d3.tsv(tsvPath, buildTree);
+    let model;
+    let tree;
+    let data;
+    const graph = d3.json(tsvPath, buildTree);

    function buildTree(error, rawdata) {
+        console.log(rawdata)
        const data = rawdata.map(d => Object.assign({}, d, { count: +d.count }));
+        console.log({ data })
+        let minRatio = $('#minRatio').val()
+        let maxRatio = $('#maxRatio').val()
+        console.log({ minRatio, maxRatio })
        model = new SentenTree.SentenTreeBuilder()
            .tokenize(SentenTree.tokenizer.tokenizeBySpace)
            .transformToken(token => (/score(d|s)?/.test(token) ? 'score' : token))
-            .buildModel(data, { maxSupportRatio: 1 });
+            .buildModel(data, { maxSupportRatio: maxRatio, minSupportRatio: minRatio });
        tree = new SentenTree.SentenTreeVis('#vis', {
            fontSize: [15, 40],
            gapBetweenGraph: 10
        });
        tree.data(model.getRenderedGraphs(2))
+            .on('nodeMouseenter', (node) => {
+                console.log(node)
+                $('#nodeTitle').removeClass('hidden')
+                $('#nodeTitleContent').html('<ul>' + node.data.topEntries.map((n) => "<li>" + data[n.id].rawtxt + "</li>").join('') + "</ul>")
+            })
+            .on('nodeMouseleave', () => {
+                $('#nodeTitle').addClass('hidden')
+            })
+            .on('linkMouseenter', (node) => {
+                $('#nodeTitle').removeClass('hidden')
+                $('#nodeTitleContent').html('出現次數：' + (node.freq / 10))
+            }).on('linkMouseleave', () => {
+                $('#nodeTitle').addClass('hidden')
+            })
        new ResizeSensor(jQuery('#d3kitRoot'), function () {
            var scale, origin;
            scale = Math.min(2, ($('#graph').outerWidth()) / ($('#d3kitRoot').outerWidth() + 60))
@@ -126,3 +172,12 @@ function switchMessageBox() {
        $('#toggleTextBox').html('隱藏文字輸入區')
    }
 }
+
+function countWords() {
+    text = $("#rawTextBox").val()
+    let wordCount = text.split(new RegExp("[\u4e00-\u9fa5]")).length - 1
+    console.log(wordCount)
+    $("#wordcount").html('字數：' + wordCount)
+}
+
+init()
--- a/templates/generalTxt.html
+++ b/templates/generalTxt.html
@@ -9,44 +9,58 @@
 </head>

 <body>
+    <div id="nodeTitle" class="nodeTitle hidden">
+        <div id="nodeTitleContent">test</div>
+    </div>
    <div id="stopWordEditorLayer" class="info hidden">
        <div id="stopWordEditor">
            <h4 id="sweTitle" style="margin:10px; display: inline;">編輯停用詞</h4>
            <ul id="sweContainer" class="w3-ul w3-hoverable" style="margin-bottom: 10px;"></ul>
            <div>
-                <input class="w3-input w3-border" style="width: 85%; display: inline;" type="text" id="newStopWord" placeholder="新增停用詞（以空白隔開）">
-                <button class="general-button w3-right" type="button" id="confirm" style="background-color: #379; margin-left: 8px;" onclick="addStopWord()">新增</button>
+                <input class="w3-input w3-border" style="width: 85%; display: inline;" type="text" id="newStopWord"
+                    placeholder="新增停用詞（以空白隔開）">
+                <button class="general-button w3-right" type="button" id="confirm"
+                    style="background-color: #379; margin-left: 8px;" onclick="addStopWord()">新增</button>
            </div>
            <div id="sweButtons" style="margin: 20px 0px;">
-                <button class="general-button" type="button" id="confirm" style="background-color: #379; margin: 0px 10px" onclick="hideStopWordEditor(); submit()">確認</button>
-                <button class="general-button" type="button" id="confirm" style="background-color: #379; margin: 0px 10px" onclick="downloadStopWord()">匯出停用詞</button>
-                <button class="general-button" type="button" id="confirm" style="background-color: #379; margin: 0px 10px" onclick="clearStopWord()">全部清除</button>
-                <button class="general-button w3-right" type="button" id="confirm" style="background-color: #379; margin: 0px 20px" onclick="hideStopWordEditor()">返回</button>
+                <button class="general-button" type="button" id="confirm"
+                    style="background-color: #379; margin: 0px 10px"
+                    onclick="hideStopWordEditor(); submit()">確認</button>
+                <button class="general-button" type="button" id="confirm"
+                    style="background-color: #379; margin: 0px 10px" onclick="downloadStopWord()">匯出停用詞</button>
+                <button class="general-button" type="button" id="confirm"
+                    style="background-color: #379; margin: 0px 10px" onclick="clearStopWord()">全部清除</button>
+                <button class="general-button w3-right" type="button" id="confirm"
+                    style="background-color: #379; margin: 0px 20px" onclick="hideStopWordEditor()">返回</button>
            </div>
        </div>
    </div>
    <div class='w3-bar w3-teal'>
-        <button class="w3-button w3-teal" type="button" onclick="location.href='/ptt'">PTT Sententree</button>
-        <button class="w3-button" type="button" onclick="location.href='/ptt_push'">推文Sententree</button>
-        <button class="w3-button w3-teal" type="button" onclick="location.href='/generalTxt'" style="color: darkseagreen;">泛用文字視覺化工具</button>
+        <button class="w3-button w3-teal" type="button" onclick="location.href='/generalTxt'"
+            style="color: darkseagreen;">泛用文字視覺化工具</button>
    </div>
    <div id='heading'>
        <h2>泛用文字視覺化工具</h2>
        <p>SentenTree <a href="https://github.com/twitter/SentenTree">https://github.com/twitter/SentenTree</a></p>
-        <p id='comment'>這是泛用.txt檔視覺化工具，能夠簡單處理文字檔的視覺化。</p>
-        <p id='comment'>支援的語言：繁體中文、英文以及所有使用空格分詞的語言。</p>
-        <p id='comment'>使用繁體中文Jieba斷詞器，不保證簡體中文能夠正常使用。</p>
+        <p id="comment">點此查看<a href="/generaltxt/help">使用說明</a></p>
    </div>
    <div style="margin:10px;">
-        <button class="general-button" type="button" id="editSWButton" style="margin:10px 0px;" onclick="showStopwordEditor()">編輯停用詞</button>
+        <button class="general-button" type="button" id="editSWButton" style="margin:10px 0px;"
+            onclick="showStopwordEditor()">編輯停用詞</button>
+        <label id="minRatioLabel">Min Ratio</label>
+        <input type="number" step="0.0001" id="minRatio" value="0.001" min="0.0001" max="1">
+        <label id="maxRatioLabel">Max Ratio</label>
+        <input type="number" step="0.0001" id="maxRatio" value="1" min="0.0001" max="1">
    </div>
    <div id='rawText' class=''>
+        <div id="wordcount">字數：0</div>
        <textarea id='rawTextBox' rows=25 placeholder="輸入要視覺化的文字
-換行為斷句"></textarea>
+換行為斷句" onchange="countWords()"></textarea>
        <button class='general-button' style='margin: 10px 0px' onclick="submit()">提交</button>
    </div>
    <div>
-        <button id='toggleTextBox' class='general-button' style='margin: 0px 10px' onclick="switchMessageBox()">隱藏文字視窗</button>
+        <button id='toggleTextBox' class='general-button' style='margin: 0px 10px'
+            onclick="switchMessageBox()">隱藏文字視窗</button>
        <div id='graph' class='hidden'>
            <div id='vis'></div>
        </div>
--- a/templates/generaltxt_help.html
+++ b/templates/generaltxt_help.html
@@ -0,0 +1,46 @@
+<!DOCTYPE html>
+<html lang="en">
+
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <link href="/static/css/w3.css" type="text/css" rel="stylesheet">
+    <link href="/static/css/main.css" type="text/css" rel="stylesheet">
+    <title>使用說明</title>
+</head>
+
+<body>
+    <h1>使用說明</h1>
+    <h2>基本說明</h2>
+    <p>泛用文字視覺化工具能夠簡單處理文字檔的視覺化。<br>
+        支援的語言：繁體中文、英文以及所有使用空格分詞的語言。<br>
+        使用繁體中文Jieba斷詞器，不保證簡體中文能夠正常使用。</p>
+    <h2>參數</h2>
+    <p>此工具提供 <span style="color: red;">minRatio</span> 和 <span style="color: red;">maxRatio</span> 兩個參數的設定<br>
+        兩個參數代表相鄰兩個單詞（有被連接的單詞）之間的最大或最小比值<br>
+        例如：maxRatio 為 0.8 時，代表兩個相鄰的單詞出現的頻率必須小於 0.8，否則單詞就不會被演算法選中。
+    </p>
+    <h2>輸入資料前處理</h2>
+    <p>本工具會將輸入資料做預先處理。以中文語料為例，處理流程大致如下：</p>
+    <p>1. 斷句：使用中文的全形句號（。）及換行進行斷句</p>
+    <p>2. 斷詞並標記詞性：使用 Jieba 將每個句子分別斷詞，並標註其詞性</p>
+    <p>3. 過濾詞性：將英文及數字過濾，以免產生過多雜訊</p>
+    <h2>停用詞</h2>
+    <p>使用者可以編輯停用詞，被設定為停用詞的單詞，將不會被選擇到 sententree 中。</p>
+    <p>在輸入停用詞時，可以一次輸入多個停用詞，並使用空格（半形）分開。</p>
+    <h2>Sententree 圖形</h2>
+    <p>輸入一份文件，預設會產生2個 sententree 圖形，圖 1 為一個 sententree 的圖形</p>
+    <p>每個圖形中間最大的單詞為<span style="color:blue;">根單詞</span></p>
+    <p>其中第二個圖形中不會包含第一個圖形的根單詞</p>
+    <p>單詞之間的連線代表兩個單詞有在同一個句子中出現過</p>
+    <p>灰色連線代表演算法在搜尋時，兩個單詞屬於同一個階層（出現在相同的句子中）</p>
+    <p>橘色連線代表兩個單詞屬於不同階層</p>
+    <p>連線的粗細代表兩個單詞同時出現的比例</p>
+    <p>將滑鼠移到單詞上，能夠看到包含該單詞的完整句子（最多顯示 5 筆），如圖 2</p>
+    <img style="width: 100%;" src="/img/general_txt_help_g01.png">
+    <span>圖 1：Sententree 圖形</span>
+    <img style="width: 100%;" src="/img/general_txt_help_g02.png">
+    <span>圖 2：完整句子顯示</span>
+</body>
+
+</html>
Author	SHA1	Message	Date
zovjsra	03ceb99422	change app.py	1 year ago
zovjsra	2c3fe7b4d7	加入說明	1 year ago
zovjsra	61f5e1e4a3	修改用詞	1 year ago
zovjsra	aa37c708e7	change tooltip content	1 year ago
zovjsra	42f30e4598	change tooltip content	1 year ago
zovjsra	660460d01d	add tooltips	1 year ago
zovjsra	cac3bae57e	change html	2 years ago
zovjsra	4ba0a7e901	加入ratio調整及滑鼠事件	2 years ago
zovjsra	04d1eb79ba	該顏色、數字數	2 years ago
zovjsra	6ca4361e58	fix dockerfile	2 years ago
zovjsra	4b6950c55a	turn off debug mode	2 years ago
zovjsra	2e7e9760c2	Modify for docker	2 years ago
Zovjsra	cd07e4f5f4	Merge remote-tracking branch 'origin/dev/addPush'	4 years ago
Zovjsra	7c4e83d8d8	哈哈	4 years ago
Zovjsra	9dab751687	Merge branch 'dev/addPush'	4 years ago
Zovjsra	7b64c699f7	Merge branch 'dev/addPush'	4 years ago
Zovjsra	4fba9b6a27	commit	4 years ago
Zovjsra	d191543da7	Merge branch 'dev/addPush'	4 years ago
Zovjsra	0110d392b4	Merge branch 'dev/addPush'	4 years ago
Zovjsra	a4059c6491	Change port	4 years ago
Zovjsra	131e7da0d9	Merge branch 'dev/addPush'	4 years ago
Zovjsra	0837db378c	Merge branch 'dev/addPush'	5 years ago
Zovjsra	d4c50b5147	to merge	5 years ago
Zovjsra	e4daf35734	to merge	5 years ago
Zovjsra	e1625ce793	Merge branch 'dev/addPush'	5 years ago
Zovjsra	ccc1839d2a	Update README.md	5 years ago