Compare commits

..

2 Commits

Author SHA1 Message Date
Zovjsra c2b9dc1b42 change port
4 years ago
Zovjsra c461ca1a9d change port
4 years ago

@ -1,10 +0,0 @@
FROM python:3.8.16-bullseye
ADD . /app
RUN pip install flask flask-compress jieba numpy progressbar2 nltk langdetect
WORKDIR /app
RUN mkdir data
ENTRYPOINT [ "/usr/local/bin/python", "app.py" ]

@ -1,3 +1 @@
# ptt-sententree # ptt-sententree
## Reference
https://github.com/twitter/SentenTree

@ -6,8 +6,8 @@ from pprint import pprint
import threading import threading
import random import random
import string import string
# import dataHandlerPTT as ptt import dataHandlerPTT as ptt
# import dataHandlerPTTPush as pttPush import dataHandlerPTTPush as pttPush
import generalText as gen import generalText as gen
import json import json
@ -33,9 +33,9 @@ def eventStream(eventQueue):
yield "event:{event}\n{data}\n\n".format(event=eventNode['event'], data=data) yield "event:{event}\n{data}\n\n".format(event=eventNode['event'], data=data)
@app.route('/img/<path:path>') @app.route('/data/<path:path>')
def send_data(path): def send_data(path):
return send_from_directory('resource/img', path) return send_from_directory('data', path)
@app.route('/generalTxt') @app.route('/generalTxt')
@ -57,7 +57,7 @@ def generalText_addText():
@app.route('/') @app.route('/')
def index(): def index():
return redirect('/generalTxt') return redirect('/ptt')
@app.route('/ptt_push') @app.route('/ptt_push')
@ -151,13 +151,10 @@ def send_resource(path):
return send_from_directory('resource', path) return send_from_directory('resource', path)
@app.route('/generaltxt/help') @app.route("/dcard_dev")
def generaltxt_help(): def dcard_dev():
return render_template('generaltxt_help.html', title="使用說明") return render_template('dcard.html', title='DCard Sentntree 測試版')
@app.route('/data/<path:path>')
def get_data(path):
return send_from_directory('data', path)
if __name__ == "__main__": if __name__ == "__main__":
app.run(debug=True, port=4980, host='0.0.0.0', threaded=False) app.run(debug=True, port=4998, host='0.0.0.0', threaded=True)

@ -15,7 +15,7 @@ from numpy import prod
from jieba import posseg from jieba import posseg
from progressbar import ProgressBar from progressbar import ProgressBar
from datetime import datetime from datetime import datetime
#from PTTData import PTTData from PTTData import PTTData
defaultDate = { defaultDate = {
@ -29,7 +29,7 @@ with open('/home/vis/pttDatabase/PTTData/Gossiping/content/content.pck', 'rb') a
f.close() f.close()
defaultStopWords = [] defaultStopWords = []
#data = PTTData('Gossiping', '/home/vis/pttDatabase/PTTData') data = PTTData('Gossiping', '/home/vis/pttDatabase/PTTData')
sentence_length = 100 sentence_length = 100
use_push_count = False use_push_count = False

@ -2,7 +2,6 @@ import jieba
import csv import csv
import nltk import nltk
import re import re
import json
from jieba import posseg from jieba import posseg
from nltk import tokenize from nltk import tokenize
from langdetect import detect from langdetect import detect
@ -32,37 +31,38 @@ def processText(randId, text, stopwords):
return '' return ''
lang = detect(text) lang = detect(text)
sentenses = [] sentenses = []
sentenses_raw = []
print(lang) print(lang)
if (lang == 'zh-cn' or lang == 'zh-tw' or lang == 'ko'): if (lang == 'zh-cn' or lang == 'zh-tw' or lang == 'ko'):
splitted = re.split('。|[\n]+', text) splitted = re.split('。|[\n]+', text)
print(splitted) print(splitted)
cutted = [] cutted = []
for spl in splitted: for i in splitted:
cutted.append(filterPOS(spl)) cutted.append(filterPOS(i))
print(cutted) print(cutted)
for spl, raw in zip(cutted, splitted): for i in cutted:
sentenses.append(' '.join(spl)) result = []
sentenses_raw.append(raw) for j in i:
if (j in stopwords):
continue
result.append(j)
if (len(result) >= 20):
sentenses.append(' '.join(result.copy()))
result = []
if (result != []):
sentenses.append(' '.join(result))
else: else:
sentenses = [] sentenses = []
sentenses_raw = [] for sentence in tokenize.sent_tokenize(text):
for sentence_raw in tokenize.sent_tokenize(text): words = sentence.lower().split(' ')
words = sentence_raw.lower().split(' ')
print([''.join([a for a in w1 if a.isalpha()]) for w1 in words]) print([''.join([a for a in w1 if a.isalpha()]) for w1 in words])
sentence = ' '.join([w for w in [''.join([a for a in w1 if a.isalpha()]) for w1 in words] if w not in [sw.lower() for sw in stopwords]]) sentence = ' '.join([w for w in [''.join([a for a in w1 if a.isalpha()]) for w1 in words] if w not in [sw.lower() for sw in stopwords]])
sentenses.append(sentence) sentenses.append(sentence)
sentenses_raw.append(sentence_raw)
result = [] result = []
for index, raw_pair in enumerate(zip(sentenses, sentenses_raw)): result.append(['id', 'text', 'count'])
sentence, sentence_raw = raw_pair for index, sentence in enumerate(sentenses):
result.append({ result.append([index, sentence, 1000])
'id': index, with open('data/' + randId + '.tsv', 'w', newline='', encoding="utf-8") as f:
'text': sentence, writer = csv.writer(f, delimiter='\t')
'count': 10, writer.writerows(result)
'rawtxt': sentence_raw, f.close()
}) return ('data/' + randId + '.tsv')
with open('data/' + randId + '.json', 'w', newline='', encoding="utf-8") as fp:
json.dump(result, fp, ensure_ascii=False, indent=4)
return ('data/' + randId + '.json')

Binary file not shown.

Before

Width:  |  Height:  |  Size: 93 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 250 KiB

@ -186,7 +186,6 @@ li a {
from { from {
opacity: 0; opacity: 0;
} }
to { to {
opacity: 1; opacity: 1;
} }
@ -203,7 +202,6 @@ li a {
from { from {
opacity: 1; opacity: 1;
} }
to { to {
opacity: 0; opacity: 0;
} }
@ -262,7 +260,7 @@ li a {
#vis { #vis {
display: inline-block; display: inline-block;
background-color: transparent; background-color: aliceblue;
position: relative; position: relative;
border-radius: 30px; border-radius: 30px;
resize: both; resize: both;

@ -1,33 +1,6 @@
var tsvPath var tsvPath
var stopwords = [] var stopwords = []
const init = () => {
$(window).on('mousemove', (e) => {
$('#nodeTitle').css({
left: e.pageX,
top: e.pageY
})
})
$('#minRatioLabel').on('mouseenter', () => {
$('#nodeTitle').removeClass('hidden')
$('#nodeTitleContent').html('兩個相鄰單詞之間出現頻率比值的最小值,小於該值不會被演算法選擇')
}).on('mouseleave', () => {
$('#nodeTitle').toggleClass('hidden')
})
$('#maxRatioLabel').on('mouseenter', () => {
$('#nodeTitle').removeClass('hidden')
$('#nodeTitleContent').html('兩個相鄰單詞之間出現頻率比值的最大值,大於該值不會被演算法選擇')
}).on('mouseleave', () => {
$('#nodeTitle').toggleClass('hidden')
})
$('#wordcount').on('mouseenter', () => {
$('#nodeTitle').removeClass('hidden')
$('#nodeTitleContent').html('僅計算中文字的字數')
}).on('mouseleave', () => {
$('#nodeTitle').toggleClass('hidden')
})
}
function clearStopWord() { function clearStopWord() {
stopwords = [] stopwords = []
$('#sweContainer').html('') $('#sweContainer').html('')
@ -113,41 +86,22 @@ function submit() {
function buildSentetree() { function buildSentetree() {
console.log("Build.") console.log("Build.")
let model; var model;
let tree; var tree;
let data; var data;
const graph = d3.json(tsvPath, buildTree); const graph = d3.tsv(tsvPath, buildTree);
function buildTree(error, rawdata) { function buildTree(error, rawdata) {
console.log(rawdata)
const data = rawdata.map(d => Object.assign({}, d, { count: +d.count })); const data = rawdata.map(d => Object.assign({}, d, { count: +d.count }));
console.log({ data })
let minRatio = $('#minRatio').val()
let maxRatio = $('#maxRatio').val()
console.log({ minRatio, maxRatio })
model = new SentenTree.SentenTreeBuilder() model = new SentenTree.SentenTreeBuilder()
.tokenize(SentenTree.tokenizer.tokenizeBySpace) .tokenize(SentenTree.tokenizer.tokenizeBySpace)
.transformToken(token => (/score(d|s)?/.test(token) ? 'score' : token)) .transformToken(token => (/score(d|s)?/.test(token) ? 'score' : token))
.buildModel(data, { maxSupportRatio: maxRatio, minSupportRatio: minRatio }); .buildModel(data, { maxSupportRatio: 1 });
tree = new SentenTree.SentenTreeVis('#vis', { tree = new SentenTree.SentenTreeVis('#vis', {
fontSize: [15, 40], fontSize: [15, 40],
gapBetweenGraph: 10 gapBetweenGraph: 10
}); });
tree.data(model.getRenderedGraphs(2)) tree.data(model.getRenderedGraphs(2))
.on('nodeMouseenter', (node) => {
console.log(node)
$('#nodeTitle').removeClass('hidden')
$('#nodeTitleContent').html('<ul>' + node.data.topEntries.map((n) => "<li>" + data[n.id].rawtxt + "</li>").join('') + "</ul>")
})
.on('nodeMouseleave', () => {
$('#nodeTitle').addClass('hidden')
})
.on('linkMouseenter', (node) => {
$('#nodeTitle').removeClass('hidden')
$('#nodeTitleContent').html('出現次數:' + (node.freq / 10))
}).on('linkMouseleave', () => {
$('#nodeTitle').addClass('hidden')
})
new ResizeSensor(jQuery('#d3kitRoot'), function() { new ResizeSensor(jQuery('#d3kitRoot'), function() {
var scale, origin; var scale, origin;
scale = Math.min(2, ($('#graph').outerWidth()) / ($('#d3kitRoot').outerWidth() + 60)) scale = Math.min(2, ($('#graph').outerWidth()) / ($('#d3kitRoot').outerWidth() + 60))
@ -172,12 +126,3 @@ function switchMessageBox() {
$('#toggleTextBox').html('隱藏文字輸入區') $('#toggleTextBox').html('隱藏文字輸入區')
} }
} }
function countWords() {
text = $("#rawTextBox").val()
let wordCount = text.split(new RegExp("[\u4e00-\u9fa5]")).length - 1
console.log(wordCount)
$("#wordcount").html('字數:' + wordCount)
}
init()

@ -9,58 +9,44 @@
</head> </head>
<body> <body>
<div id="nodeTitle" class="nodeTitle hidden">
<div id="nodeTitleContent">test</div>
</div>
<div id="stopWordEditorLayer" class="info hidden"> <div id="stopWordEditorLayer" class="info hidden">
<div id="stopWordEditor"> <div id="stopWordEditor">
<h4 id="sweTitle" style="margin:10px; display: inline;">編輯停用詞</h4> <h4 id="sweTitle" style="margin:10px; display: inline;">編輯停用詞</h4>
<ul id="sweContainer" class="w3-ul w3-hoverable" style="margin-bottom: 10px;"></ul> <ul id="sweContainer" class="w3-ul w3-hoverable" style="margin-bottom: 10px;"></ul>
<div> <div>
<input class="w3-input w3-border" style="width: 85%; display: inline;" type="text" id="newStopWord" <input class="w3-input w3-border" style="width: 85%; display: inline;" type="text" id="newStopWord" placeholder="新增停用詞(以空白隔開)">
placeholder="新增停用詞(以空白隔開)"> <button class="general-button w3-right" type="button" id="confirm" style="background-color: #379; margin-left: 8px;" onclick="addStopWord()">新增</button>
<button class="general-button w3-right" type="button" id="confirm"
style="background-color: #379; margin-left: 8px;" onclick="addStopWord()">新增</button>
</div> </div>
<div id="sweButtons" style="margin: 20px 0px;"> <div id="sweButtons" style="margin: 20px 0px;">
<button class="general-button" type="button" id="confirm" <button class="general-button" type="button" id="confirm" style="background-color: #379; margin: 0px 10px" onclick="hideStopWordEditor(); submit()">確認</button>
style="background-color: #379; margin: 0px 10px" <button class="general-button" type="button" id="confirm" style="background-color: #379; margin: 0px 10px" onclick="downloadStopWord()">匯出停用詞</button>
onclick="hideStopWordEditor(); submit()">確認</button> <button class="general-button" type="button" id="confirm" style="background-color: #379; margin: 0px 10px" onclick="clearStopWord()">全部清除</button>
<button class="general-button" type="button" id="confirm" <button class="general-button w3-right" type="button" id="confirm" style="background-color: #379; margin: 0px 20px" onclick="hideStopWordEditor()">返回</button>
style="background-color: #379; margin: 0px 10px" onclick="downloadStopWord()">匯出停用詞</button>
<button class="general-button" type="button" id="confirm"
style="background-color: #379; margin: 0px 10px" onclick="clearStopWord()">全部清除</button>
<button class="general-button w3-right" type="button" id="confirm"
style="background-color: #379; margin: 0px 20px" onclick="hideStopWordEditor()">返回</button>
</div> </div>
</div> </div>
</div> </div>
<div class='w3-bar w3-teal'> <div class='w3-bar w3-teal'>
<button class="w3-button w3-teal" type="button" onclick="location.href='/generalTxt'" <button class="w3-button w3-teal" type="button" onclick="location.href='/ptt'">PTT Sententree</button>
style="color: darkseagreen;">泛用文字視覺化工具</button> <button class="w3-button" type="button" onclick="location.href='/ptt_push'">推文Sententree</button>
<button class="w3-button w3-teal" type="button" onclick="location.href='/generalTxt'" style="color: darkseagreen;">泛用文字視覺化工具</button>
</div> </div>
<div id='heading'> <div id='heading'>
<h2>泛用文字視覺化工具</h2> <h2>泛用文字視覺化工具</h2>
<p>SentenTree <a href="https://github.com/twitter/SentenTree">https://github.com/twitter/SentenTree</a></p> <p>SentenTree <a href="https://github.com/twitter/SentenTree">https://github.com/twitter/SentenTree</a></p>
<p id="comment">點此查看<a href="/generaltxt/help">使用說明</a></p> <p id='comment'>這是泛用.txt檔視覺化工具能夠簡單處理文字檔的視覺化。</p>
<p id='comment'>支援的語言:繁體中文、英文以及所有使用空格分詞的語言。</p>
<p id='comment'>使用繁體中文Jieba斷詞器不保證簡體中文能夠正常使用。</p>
</div> </div>
<div style="margin:10px;"> <div style="margin:10px;">
<button class="general-button" type="button" id="editSWButton" style="margin:10px 0px;" <button class="general-button" type="button" id="editSWButton" style="margin:10px 0px;" onclick="showStopwordEditor()">編輯停用詞</button>
onclick="showStopwordEditor()">編輯停用詞</button>
<label id="minRatioLabel">Min Ratio</label>
<input type="number" step="0.0001" id="minRatio" value="0.001" min="0.0001" max="1">
<label id="maxRatioLabel">Max Ratio</label>
<input type="number" step="0.0001" id="maxRatio" value="1" min="0.0001" max="1">
</div> </div>
<div id='rawText' class=''> <div id='rawText' class=''>
<div id="wordcount">字數0</div>
<textarea id='rawTextBox' rows=25 placeholder="輸入要視覺化的文字 <textarea id='rawTextBox' rows=25 placeholder="輸入要視覺化的文字
換行為斷句" onchange="countWords()"></textarea> 換行為斷句"></textarea>
<button class='general-button' style='margin: 10px 0px' onclick="submit()">提交</button> <button class='general-button' style='margin: 10px 0px' onclick="submit()">提交</button>
</div> </div>
<div> <div>
<button id='toggleTextBox' class='general-button' style='margin: 0px 10px' <button id='toggleTextBox' class='general-button' style='margin: 0px 10px' onclick="switchMessageBox()">隱藏文字視窗</button>
onclick="switchMessageBox()">隱藏文字視窗</button>
<div id='graph' class='hidden'> <div id='graph' class='hidden'>
<div id='vis'></div> <div id='vis'></div>
</div> </div>

@ -1,46 +0,0 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<link href="/static/css/w3.css" type="text/css" rel="stylesheet">
<link href="/static/css/main.css" type="text/css" rel="stylesheet">
<title>使用說明</title>
</head>
<body>
<h1>使用說明</h1>
<h2>基本說明</h2>
<p>泛用文字視覺化工具能夠簡單處理文字檔的視覺化。<br>
支援的語言:繁體中文、英文以及所有使用空格分詞的語言。<br>
使用繁體中文Jieba斷詞器不保證簡體中文能夠正常使用。</p>
<h2>參數</h2>
<p>此工具提供 <span style="color: red;">minRatio</span> 與 <span style="color: red;">maxRatio</span> 兩個參數的設定。<br>
兩個參數代表相鄰兩個單詞(有被連接的單詞)之間出現頻率比值的最小值與最大值。<br>
例如:maxRatio 為 0.8 時,代表兩個相鄰單詞的出現頻率比值必須小於 0.8,否則單詞就不會被演算法選中。
</p>
<h2>輸入資料前處理</h2>
<p>本工具會將輸入資料做預先處理。以中文語料為例,處理流程大致如下:</p>
<p>1. 斷句:使用中文的全形句號(。)及換行進行斷句</p>
<p>2. 斷詞並標記詞性:使用 Jieba 將每個句子分別斷詞,並標註其詞性</p>
<p>3. 過濾詞性:將英文及數字過濾,以免產生過多雜訊</p>
<h2>停用詞</h2>
<p>使用者可以編輯停用詞,被設定為停用詞的單詞,將不會被選擇到 sententree 中。</p>
<p>在輸入停用詞時,可以一次輸入多個停用詞,並使用空格(半形)分開。</p>
<h2>Sententree 圖形</h2>
<p>輸入一份文件預設會產生2個 sententree 圖形,圖 1 為一個 sententree 的圖形</p>
<p>每個圖形中間最大的單詞為<span style="color:blue;">根單詞</span></p>
<p>其中第二個圖形中不會包含第一個圖形的根單詞</p>
<p>單詞之間的連線代表兩個單詞有在同一個句子中出現過</p>
<p>灰色連線代表演算法在搜尋時,兩個單詞屬於同一個階層(出現在相同的句子中)</p>
<p>橘色連線代表兩個單詞屬於不同階層</p>
<p>連線的粗細代表兩個單詞同時出現的比例</p>
<p>將滑鼠移到單詞上,能夠看到包含該單詞的完整句子(最多顯示 5 筆),如圖 2</p>
<img style="width: 100%;" src="/img/general_txt_help_g01.png">
<span>圖 1:Sententree 圖形</span>
<img style="width: 100%;" src="/img/general_txt_help_g02.png">
<span>圖 2:完整句子顯示</span>
</body>
</html>
Loading…
Cancel
Save