Compare commits

..

26 Commits

@ -0,0 +1,10 @@
# Container image for the ptt-sententree Flask application.
FROM python:3.8.16-bullseye

# Install dependencies before copying the sources so this layer is reused
# from the build cache when only application code changes.
# --no-cache-dir keeps pip's download cache out of the image.
RUN pip install --no-cache-dir flask flask-compress jieba numpy progressbar2 nltk langdetect

WORKDIR /app

# COPY instead of ADD: only a plain copy of the build context is needed,
# not ADD's archive auto-extraction or URL fetching.
COPY . /app

# app.py serves generated files from ./data. -p keeps the build from failing
# when the build context already contains a data/ directory.
RUN mkdir -p data

ENTRYPOINT [ "/usr/local/bin/python", "app.py" ]

@ -1 +1,3 @@
# ptt-sententree # ptt-sententree
## Reference
https://github.com/twitter/SentenTree

@ -6,8 +6,8 @@ from pprint import pprint
import threading import threading
import random import random
import string import string
import dataHandlerPTT as ptt # import dataHandlerPTT as ptt
import dataHandlerPTTPush as pttPush # import dataHandlerPTTPush as pttPush
import generalText as gen import generalText as gen
import json import json
@ -33,9 +33,9 @@ def eventStream(eventQueue):
yield "event:{event}\n{data}\n\n".format(event=eventNode['event'], data=data) yield "event:{event}\n{data}\n\n".format(event=eventNode['event'], data=data)
@app.route('/data/<path:path>') @app.route('/img/<path:path>')
def send_data(path): def send_data(path):
return send_from_directory('data', path) return send_from_directory('resource/img', path)
@app.route('/generalTxt') @app.route('/generalTxt')
@ -57,7 +57,7 @@ def generalText_addText():
@app.route('/') @app.route('/')
def index(): def index():
return redirect('/ptt') return redirect('/generalTxt')
@app.route('/ptt_push') @app.route('/ptt_push')
@ -151,10 +151,13 @@ def send_resource(path):
return send_from_directory('resource', path) return send_from_directory('resource', path)
@app.route("/dcard_dev") @app.route('/generaltxt/help')
def dcard_dev(): def generaltxt_help():
return render_template('dcard.html', title='DCard Sentntree 測試版') return render_template('generaltxt_help.html', title="使用說明")
@app.route('/data/<path:path>')
def get_data(path):
    """Serve the file at ``path`` from the ``data`` directory."""
    data_dir = 'data'
    return send_from_directory(data_dir, path)
if __name__ == "__main__": if __name__ == "__main__":
app.run(debug=True, port=4998, host='0.0.0.0', threaded=True) app.run(debug=True, port=4980, host='0.0.0.0', threaded=False)

@ -15,7 +15,7 @@ from numpy import prod
from jieba import posseg from jieba import posseg
from progressbar import ProgressBar from progressbar import ProgressBar
from datetime import datetime from datetime import datetime
from PTTData import PTTData #from PTTData import PTTData
defaultDate = { defaultDate = {
@ -29,7 +29,7 @@ with open('/home/vis/pttDatabase/PTTData/Gossiping/content/content.pck', 'rb') a
f.close() f.close()
defaultStopWords = [] defaultStopWords = []
data = PTTData('Gossiping', '/home/vis/pttDatabase/PTTData') #data = PTTData('Gossiping', '/home/vis/pttDatabase/PTTData')
sentence_length = 100 sentence_length = 100
use_push_count = False use_push_count = False

@ -2,6 +2,7 @@ import jieba
import csv import csv
import nltk import nltk
import re import re
import json
from jieba import posseg from jieba import posseg
from nltk import tokenize from nltk import tokenize
from langdetect import detect from langdetect import detect
@ -27,42 +28,41 @@ def filterPOS(text):
def processText(randId, text, stopwords):
    """Split ``text`` into sentences, tokenize them and dump them as JSON.

    The language is detected with ``langdetect``. Text detected as
    ``zh-cn``, ``zh-tw`` or ``ko`` is split on the full-width period and on
    newlines, and every piece is tokenized with ``filterPOS``. Any other
    language is split with ``nltk.tokenize.sent_tokenize``; each word is
    lower-cased and stripped of non-alphabetic characters. In both branches
    words listed in ``stopwords`` are dropped.

    Args:
        randId: Name (without extension) of the output file under ``data/``.
        text: Raw user text.
        stopwords: Iterable of words to exclude from the tokenized sentences.

    Returns:
        ``''`` when ``text`` is empty, otherwise the path of the JSON file
        that was written (``data/<randId>.json``). The file holds a list of
        objects with the keys ``id``, ``text``, ``count`` and ``rawtxt``.
    """
    if text == '':
        return ''

    lang = detect(text)
    print(lang)

    sentenses = []
    sentenses_raw = []
    if lang in ('zh-cn', 'zh-tw', 'ko'):
        # Build the lookup once; membership tests then cost O(1) per token.
        blocked = set(stopwords)
        splitted = re.split('。|[\n]+', text)
        print(splitted)
        cutted = [filterPOS(spl) for spl in splitted]
        print(cutted)
        for spl, raw in zip(cutted, splitted):
            # Stop words must be removed here as well as in the branch
            # below; otherwise user-supplied stop words are silently ignored
            # for Chinese/Korean input.
            sentenses.append(' '.join(w for w in spl if w not in blocked))
            sentenses_raw.append(raw)
    else:
        # Lower-case the stop words once instead of once per word.
        blocked = {sw.lower() for sw in stopwords}
        for sentence_raw in tokenize.sent_tokenize(text):
            words = [
                ''.join(ch for ch in w if ch.isalpha())
                for w in sentence_raw.lower().split(' ')
            ]
            print(words)
            sentenses.append(' '.join(w for w in words if w not in blocked))
            sentenses_raw.append(sentence_raw)

    # ``id`` is the position in the list, so a consumer can map an entry
    # back to its ``rawtxt`` by index.
    result = [
        {
            'id': index,
            'text': sentence,
            'count': 10,
            'rawtxt': sentence_raw,
        }
        for index, (sentence, sentence_raw) in enumerate(zip(sentenses, sentenses_raw))
    ]

    out_path = 'data/' + randId + '.json'
    with open(out_path, 'w', newline='', encoding="utf-8") as fp:
        json.dump(result, fp, ensure_ascii=False, indent=4)
    return out_path

Binary file not shown.

After

Width:  |  Height:  |  Size: 93 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 250 KiB

@ -186,6 +186,7 @@ li a {
from { from {
opacity: 0; opacity: 0;
} }
to { to {
opacity: 1; opacity: 1;
} }
@ -202,6 +203,7 @@ li a {
from { from {
opacity: 1; opacity: 1;
} }
to { to {
opacity: 0; opacity: 0;
} }
@ -260,7 +262,7 @@ li a {
#vis { #vis {
display: inline-block; display: inline-block;
background-color: aliceblue; background-color: transparent;
position: relative; position: relative;
border-radius: 30px; border-radius: 30px;
resize: both; resize: both;

@ -1,6 +1,33 @@
var tsvPath var tsvPath
var stopwords = [] var stopwords = []
/**
 * Wire up the shared hover tooltip (#nodeTitle / #nodeTitleContent).
 *
 * The tooltip follows the mouse pointer, and hovering one of the annotated
 * controls shows that control's explanatory text in it.
 */
const init = () => {
    // Keep the tooltip positioned at the pointer.
    $(window).on('mousemove', (e) => {
        $('#nodeTitle').css({
            left: e.pageX,
            top: e.pageY
        })
    })

    // selector of the hovered element -> text shown in the tooltip
    const hints = {
        '#minRatioLabel': '兩個相鄰單詞之間出現頻率比值的最小值,小於該值不會被演算法選擇',
        '#maxRatioLabel': '兩個相鄰單詞之間出現頻率比值的最大值,大於該值不會被演算法選擇',
        '#wordcount': '僅計算中文字的字數'
    }

    Object.keys(hints).forEach((selector) => {
        $(selector)
            .on('mouseenter', () => {
                $('#nodeTitle').removeClass('hidden')
                $('#nodeTitleContent').html(hints[selector])
            })
            .on('mouseleave', () => {
                // addClass rather than toggleClass: a leave event that is not
                // paired with an enter (e.g. the pointer already rests on the
                // element when the page loads) must still hide the tooltip
                // instead of flipping it back to visible.
                $('#nodeTitle').addClass('hidden')
            })
    })
}
function clearStopWord() { function clearStopWord() {
stopwords = [] stopwords = []
$('#sweContainer').html('') $('#sweContainer').html('')
@ -15,7 +42,7 @@ function addStopWord() {
} else { } else {
stopwords.push(newsw) stopwords.push(newsw)
$('#sweContainer').append($('<li>').attr('class', 'w3-display-container').append($('<span>').append(newsw)).append($('<span>').attr('class', 'w3-button w3-hover-red w3-transparent w3-display-right').click(function(e) { $('#sweContainer').append($('<li>').attr('class', 'w3-display-container').append($('<span>').append(newsw)).append($('<span>').attr('class', 'w3-button w3-hover-red w3-transparent w3-display-right').click(function (e) {
var index = $(this).parent().index() var index = $(this).parent().index()
console.log(stopwords[index]) console.log(stopwords[index])
stopwords.splice(index, 1) stopwords.splice(index, 1)
@ -33,14 +60,14 @@ function addStopWord() {
function showStopwordEditor() { function showStopwordEditor() {
console.log(stopwords) console.log(stopwords)
$(window).unbind('keydown') $(window).unbind('keydown')
$(window).keydown(function(event) { $(window).keydown(function (event) {
if (event.keyCode == 13) { if (event.keyCode == 13) {
addStopWord() addStopWord()
} }
}) })
$('#sweContainer').empty() $('#sweContainer').empty()
for (word of stopwords) { for (word of stopwords) {
$('#sweContainer').append($('<li>').attr('class', 'w3-display-container').append($('<span>').append(word)).append($('<span>').attr('class', 'w3-button w3-hover-red w3-transparent w3-display-right').click(function(e) { $('#sweContainer').append($('<li>').attr('class', 'w3-display-container').append($('<span>').append(word)).append($('<span>').attr('class', 'w3-button w3-hover-red w3-transparent w3-display-right').click(function (e) {
var index = $(this).parent().index() var index = $(this).parent().index()
console.log(stopwords[index]) console.log(stopwords[index])
stopwords.splice(index, 1) stopwords.splice(index, 1)
@ -53,7 +80,7 @@ function showStopwordEditor() {
function hideStopWordEditor() { function hideStopWordEditor() {
$(window).unbind('keydown') $(window).unbind('keydown')
$(window).keydown(function(event) { $(window).keydown(function (event) {
if (event.keyCode == 13) { if (event.keyCode == 13) {
event.preventDefault() event.preventDefault()
sendRequest() sendRequest()
@ -74,7 +101,7 @@ function submit() {
stopwords: stopwords stopwords: stopwords
}), }),
contentType: 'application/json', contentType: 'application/json',
success: function(data) { success: function (data) {
tsvPath = data.Result.path tsvPath = data.Result.path
destroyCurrentGraph() destroyCurrentGraph()
d3.select('#graph').append('div').attr('id', 'vis') d3.select('#graph').append('div').attr('id', 'vis')
@ -86,23 +113,42 @@ function submit() {
function buildSentetree() { function buildSentetree() {
console.log("Build.") console.log("Build.")
var model; let model;
var tree; let tree;
var data; let data;
const graph = d3.tsv(tsvPath, buildTree); const graph = d3.json(tsvPath, buildTree);
function buildTree(error, rawdata) { function buildTree(error, rawdata) {
console.log(rawdata)
const data = rawdata.map(d => Object.assign({}, d, { count: +d.count })); const data = rawdata.map(d => Object.assign({}, d, { count: +d.count }));
console.log({ data })
let minRatio = $('#minRatio').val()
let maxRatio = $('#maxRatio').val()
console.log({ minRatio, maxRatio })
model = new SentenTree.SentenTreeBuilder() model = new SentenTree.SentenTreeBuilder()
.tokenize(SentenTree.tokenizer.tokenizeBySpace) .tokenize(SentenTree.tokenizer.tokenizeBySpace)
.transformToken(token => (/score(d|s)?/.test(token) ? 'score' : token)) .transformToken(token => (/score(d|s)?/.test(token) ? 'score' : token))
.buildModel(data, { maxSupportRatio: 1 }); .buildModel(data, { maxSupportRatio: maxRatio, minSupportRatio: minRatio });
tree = new SentenTree.SentenTreeVis('#vis', { tree = new SentenTree.SentenTreeVis('#vis', {
fontSize: [15, 40], fontSize: [15, 40],
gapBetweenGraph: 10 gapBetweenGraph: 10
}); });
tree.data(model.getRenderedGraphs(2)) tree.data(model.getRenderedGraphs(2))
new ResizeSensor(jQuery('#d3kitRoot'), function() { .on('nodeMouseenter', (node) => {
console.log(node)
$('#nodeTitle').removeClass('hidden')
$('#nodeTitleContent').html('<ul>' + node.data.topEntries.map((n) => "<li>" + data[n.id].rawtxt + "</li>").join('') + "</ul>")
})
.on('nodeMouseleave', () => {
$('#nodeTitle').addClass('hidden')
})
.on('linkMouseenter', (node) => {
$('#nodeTitle').removeClass('hidden')
$('#nodeTitleContent').html('出現次數:' + (node.freq / 10))
}).on('linkMouseleave', () => {
$('#nodeTitle').addClass('hidden')
})
new ResizeSensor(jQuery('#d3kitRoot'), function () {
var scale, origin; var scale, origin;
scale = Math.min(2, ($('#graph').outerWidth()) / ($('#d3kitRoot').outerWidth() + 60)) scale = Math.min(2, ($('#graph').outerWidth()) / ($('#d3kitRoot').outerWidth() + 60))
@ -126,3 +172,12 @@ function switchMessageBox() {
$('#toggleTextBox').html('隱藏文字輸入區') $('#toggleTextBox').html('隱藏文字輸入區')
} }
} }
/**
 * Count the characters of #rawTextBox that fall in the range U+4E00-U+9FA5
 * and display the total in #wordcount.
 */
function countWords() {
    // Declared with const: a bare `text = ...` creates an implicit global
    // (and throws a ReferenceError in strict mode).
    const text = $("#rawTextBox").val()
    // match() returns null when nothing matches, hence the explicit fallback.
    const matches = text.match(/[\u4e00-\u9fa5]/g)
    const wordCount = matches === null ? 0 : matches.length
    console.log(wordCount)
    $("#wordcount").html('字數:' + wordCount)
}
init()

@ -17,7 +17,7 @@ function init() {
type: 'POST', type: 'POST',
url: '/init', url: '/init',
dataType: 'json', dataType: 'json',
success: function(data) { success: function (data) {
console.log(data) console.log(data)
setDate(data.Result.startDate, data.Result.endDate) setDate(data.Result.startDate, data.Result.endDate)
document.getElementById('keywordBox').value = data.Result.keyword document.getElementById('keywordBox').value = data.Result.keyword
@ -39,44 +39,44 @@ function init() {
buildSentetree() buildSentetree()
} }
}) })
$(document).ready(function() { $(document).ready(function () {
$(window).keydown(function(event) { $(window).keydown(function (event) {
if (event.keyCode == 13) { if (event.keyCode == 13) {
event.preventDefault() event.preventDefault()
sendRequest() sendRequest()
} }
}); });
}); });
$(window).on('mousemove', function(e) { $(window).on('mousemove', function (e) {
$('#nodeTitle').css({ $('#nodeTitle').css({
left: e.pageX, left: e.pageX,
top: e.pageY top: e.pageY
}) })
}) })
$('#titleListContainer').hover( $('#titleListContainer').hover(
function() { // Run on hover/mouseenter function () { // Run on hover/mouseenter
$(this).css('overflow', 'auto') $(this).css('overflow', 'auto')
}, },
function() { // Run on mouseleave function () { // Run on mouseleave
$(this).css('overflow', 'hidden') $(this).css('overflow', 'hidden')
} }
) )
$('#titleListLayer').click(function(e) { $('#titleListLayer').click(function (e) {
if ($('#titleListLayer').is(e.target)) { if ($('#titleListLayer').is(e.target)) {
hideTitles() hideTitles()
} }
}) })
$('#stopWordEditorLayer').click(function(e) { $('#stopWordEditorLayer').click(function (e) {
if ($('#stopWordEditorLayer').is(e.target)) { if ($('#stopWordEditorLayer').is(e.target)) {
hideStopWordEditor() hideStopWordEditor()
} }
}) })
$('#idfEditorLayer').click(function(e) { $('#idfEditorLayer').click(function (e) {
if ($('#idfEditorLayer').is(e.target)) { if ($('#idfEditorLayer').is(e.target)) {
hideIdfEditor() hideIdfEditor()
} }
}) })
$('#pttPageWindow').click(function(e) { $('#pttPageWindow').click(function (e) {
if ($('#pttPageWindow').is(e.target)) { if ($('#pttPageWindow').is(e.target)) {
hidePTTPage() hidePTTPage()
} }
@ -127,7 +127,7 @@ function addStopWord() {
} else { } else {
stopwords.push(newsw) stopwords.push(newsw)
$('#sweContainer').append($('<li>').attr('class', 'w3-display-container').append($('<span>').append(newsw)).append($('<span>').attr('class', 'w3-button w3-hover-red w3-transparent w3-display-right').click(function(e) { $('#sweContainer').append($('<li>').attr('class', 'w3-display-container').append($('<span>').append(newsw)).append($('<span>').attr('class', 'w3-button w3-hover-red w3-transparent w3-display-right').click(function (e) {
var index = $(this).parent().index() var index = $(this).parent().index()
console.log(stopwords[index]) console.log(stopwords[index])
stopwords.splice(index, 1) stopwords.splice(index, 1)
@ -172,14 +172,14 @@ function scrollIdfList() {
function showStopwordEditor() { function showStopwordEditor() {
$(window).unbind('keydown') $(window).unbind('keydown')
$(window).keydown(function(event) { $(window).keydown(function (event) {
if (event.keyCode == 13) { if (event.keyCode == 13) {
addStopWord() addStopWord()
} }
}) })
$('#sweContainer').empty() $('#sweContainer').empty()
for (word of stopwords) { for (word of stopwords) {
$('#sweContainer').append($('<li>').append($('<span>').append(word)).append($('<span>').attr('class', 'w3-button w3-hover-red w3-transparent w3-display-right').click(function(e) { $('#sweContainer').append($('<li>').append($('<span>').append(word)).append($('<span>').attr('class', 'w3-button w3-hover-red w3-transparent w3-display-right').click(function (e) {
var index = $(this).parent().index() var index = $(this).parent().index()
console.log(stopwords[index]) console.log(stopwords[index])
stopwords.splice(index, 1) stopwords.splice(index, 1)
@ -192,7 +192,7 @@ function showStopwordEditor() {
function showIdfEditor() { function showIdfEditor() {
$(window).unbind('keydown') $(window).unbind('keydown')
$(window).keydown(function(event) { $(window).keydown(function (event) {
if (event.keyCode == 13) { if (event.keyCode == 13) {
scrollIdfList() scrollIdfList()
} }
@ -226,7 +226,7 @@ function showIdfEditor() {
.append($('<button>') .append($('<button>')
.attr('class', 'general-button') .attr('class', 'general-button')
.html('設為最小') .html('設為最小')
.click(function() { .click(function () {
$(this) $(this)
.parent() .parent()
.parent() .parent()
@ -236,7 +236,7 @@ function showIdfEditor() {
.append($('<button>') .append($('<button>')
.attr('class', 'general-button') .attr('class', 'general-button')
.html('設為最大') .html('設為最大')
.click(function() { .click(function () {
$(this) $(this)
.parent() .parent()
.parent() .parent()
@ -248,7 +248,7 @@ function showIdfEditor() {
.append($('<button>') .append($('<button>')
.attr('class', 'general-button') .attr('class', 'general-button')
.html('重設') .html('重設')
.click(function() { .click(function () {
var _word = $($(this) var _word = $($(this)
.parent() .parent()
.parent() .parent()
@ -275,7 +275,7 @@ function showIdfEditor() {
function hideStopWordEditor() { function hideStopWordEditor() {
$(window).unbind('keydown') $(window).unbind('keydown')
$(window).keydown(function(event) { $(window).keydown(function (event) {
if (event.keyCode == 13) { if (event.keyCode == 13) {
event.preventDefault() event.preventDefault()
sendRequest() sendRequest()
@ -286,7 +286,7 @@ function hideStopWordEditor() {
function hideIdfEditor() { function hideIdfEditor() {
$(window).unbind('keydown') $(window).unbind('keydown')
$(window).keydown(function(event) { $(window).keydown(function (event) {
if (event.keyCode == 13) { if (event.keyCode == 13) {
event.preventDefault() event.preventDefault()
sendRequest() sendRequest()
@ -369,7 +369,7 @@ function sendRequest() {
url: '/addRequest', url: '/addRequest',
data: content, data: content,
contentType: 'application/json', contentType: 'application/json',
success: function(data) { success: function (data) {
console.log(data) console.log(data)
changeGraph(data.Result) changeGraph(data.Result)
} }
@ -460,14 +460,14 @@ function buildSentetree() {
}) })
}) })
} else { } else {
seqList = node.data.seq.DBs.map(function(n) { seqList = node.data.seq.DBs.map(function (n) {
return n.rawText return n.rawText
}) })
} }
titleList = [] titleList = []
for (s of seqList) { for (s of seqList) {
titleTemp = wordTitleList[s] titleTemp = wordTitleList[s]
if ((titleList.map(function(n) { if ((titleList.map(function (n) {
return n.title return n.title
})).indexOf(titleTemp.title) == -1) { })).indexOf(titleTemp.title) == -1) {
titleList.push(titleTemp) titleList.push(titleTemp)
@ -508,7 +508,7 @@ function buildSentetree() {
globKeyword: globKeyword globKeyword: globKeyword
}), }),
contentType: 'application/json', contentType: 'application/json',
success: function(data) { success: function (data) {
console.log(data) console.log(data)
$('#titleListKeywordInfo').html('單詞出現次數:' + data.Result.wordCount + ', 單詞出現的文章數:' + data.Result.postCount + ', 單詞頻率:' + (data.Result.postCount * 100 / totalPosts).toFixed(2) + '%') $('#titleListKeywordInfo').html('單詞出現次數:' + data.Result.wordCount + ', 單詞出現的文章數:' + data.Result.postCount + ', 單詞頻率:' + (data.Result.postCount * 100 / totalPosts).toFixed(2) + '%')
} }
@ -527,7 +527,7 @@ function buildSentetree() {
).append( ).append(
$('<span>').attr('style', 'margin: 0px 10px').html('推文數:' + i.pushes) $('<span>').attr('style', 'margin: 0px 10px').html('推文數:' + i.pushes)
) )
).click(function() { ).click(function () {
let indx = $(this).index() let indx = $(this).index()
showPTTPage((titleList[indx].url).replace('www.ptt.cc', 'www.pttweb.cc')) showPTTPage((titleList[indx].url).replace('www.ptt.cc', 'www.pttweb.cc'))
}) })
@ -536,7 +536,7 @@ function buildSentetree() {
}) })
.on('nodeMouseenter', node => { .on('nodeMouseenter', node => {
console.log(node) console.log(node)
titles = node.data.topEntries.map(function(x) { titles = node.data.topEntries.map(function (x) {
return wordTitleList[x.rawText] return wordTitleList[x.rawText]
}) })
console.log(titles) console.log(titles)
@ -564,7 +564,7 @@ function buildSentetree() {
}).on('linkMouseenter', link => { }).on('linkMouseenter', link => {
console.log(link) console.log(link)
}) })
new ResizeSensor(jQuery('#d3kitRoot'), function() { new ResizeSensor(jQuery('#d3kitRoot'), function () {
var scale, origin; var scale, origin;
scale = Math.min(2, ($('#graph').outerWidth()) / ($('#d3kitRoot').outerWidth() + 60)) scale = Math.min(2, ($('#graph').outerWidth()) / ($('#d3kitRoot').outerWidth() + 60))

@ -9,44 +9,58 @@
</head> </head>
<body> <body>
<div id="nodeTitle" class="nodeTitle hidden">
<div id="nodeTitleContent">test</div>
</div>
<div id="stopWordEditorLayer" class="info hidden"> <div id="stopWordEditorLayer" class="info hidden">
<div id="stopWordEditor"> <div id="stopWordEditor">
<h4 id="sweTitle" style="margin:10px; display: inline;">編輯停用詞</h4> <h4 id="sweTitle" style="margin:10px; display: inline;">編輯停用詞</h4>
<ul id="sweContainer" class="w3-ul w3-hoverable" style="margin-bottom: 10px;"></ul> <ul id="sweContainer" class="w3-ul w3-hoverable" style="margin-bottom: 10px;"></ul>
<div> <div>
<input class="w3-input w3-border" style="width: 85%; display: inline;" type="text" id="newStopWord" placeholder="新增停用詞(以空白隔開)"> <input class="w3-input w3-border" style="width: 85%; display: inline;" type="text" id="newStopWord"
<button class="general-button w3-right" type="button" id="confirm" style="background-color: #379; margin-left: 8px;" onclick="addStopWord()">新增</button> placeholder="新增停用詞(以空白隔開)">
<button class="general-button w3-right" type="button" id="confirm"
style="background-color: #379; margin-left: 8px;" onclick="addStopWord()">新增</button>
</div> </div>
<div id="sweButtons" style="margin: 20px 0px;"> <div id="sweButtons" style="margin: 20px 0px;">
<button class="general-button" type="button" id="confirm" style="background-color: #379; margin: 0px 10px" onclick="hideStopWordEditor(); submit()">確認</button> <button class="general-button" type="button" id="confirm"
<button class="general-button" type="button" id="confirm" style="background-color: #379; margin: 0px 10px" onclick="downloadStopWord()">匯出停用詞</button> style="background-color: #379; margin: 0px 10px"
<button class="general-button" type="button" id="confirm" style="background-color: #379; margin: 0px 10px" onclick="clearStopWord()">全部清除</button> onclick="hideStopWordEditor(); submit()">確認</button>
<button class="general-button w3-right" type="button" id="confirm" style="background-color: #379; margin: 0px 20px" onclick="hideStopWordEditor()">返回</button> <button class="general-button" type="button" id="confirm"
style="background-color: #379; margin: 0px 10px" onclick="downloadStopWord()">匯出停用詞</button>
<button class="general-button" type="button" id="confirm"
style="background-color: #379; margin: 0px 10px" onclick="clearStopWord()">全部清除</button>
<button class="general-button w3-right" type="button" id="confirm"
style="background-color: #379; margin: 0px 20px" onclick="hideStopWordEditor()">返回</button>
</div> </div>
</div> </div>
</div> </div>
<div class='w3-bar w3-teal'> <div class='w3-bar w3-teal'>
<button class="w3-button w3-teal" type="button" onclick="location.href='/ptt'">PTT Sententree</button> <button class="w3-button w3-teal" type="button" onclick="location.href='/generalTxt'"
<button class="w3-button" type="button" onclick="location.href='/ptt_push'">推文Sententree</button> style="color: darkseagreen;">泛用文字視覺化工具</button>
<button class="w3-button w3-teal" type="button" onclick="location.href='/generalTxt'" style="color: darkseagreen;">泛用文字視覺化工具</button>
</div> </div>
<div id='heading'> <div id='heading'>
<h2>泛用文字視覺化工具</h2> <h2>泛用文字視覺化工具</h2>
<p>SentenTree <a href="https://github.com/twitter/SentenTree">https://github.com/twitter/SentenTree</a></p> <p>SentenTree <a href="https://github.com/twitter/SentenTree">https://github.com/twitter/SentenTree</a></p>
<p id='comment'>這是泛用.txt檔視覺化工具能夠簡單處理文字檔的視覺化。</p> <p id="comment">點此查看<a href="/generaltxt/help">使用說明</a></p>
<p id='comment'>支援的語言:繁體中文、英文以及所有使用空格分詞的語言。</p>
<p id='comment'>使用繁體中文Jieba斷詞器不保證簡體中文能夠正常使用。</p>
</div> </div>
<div style="margin:10px;"> <div style="margin:10px;">
<button class="general-button" type="button" id="editSWButton" style="margin:10px 0px;" onclick="showStopwordEditor()">編輯停用詞</button> <button class="general-button" type="button" id="editSWButton" style="margin:10px 0px;"
onclick="showStopwordEditor()">編輯停用詞</button>
<label id="minRatioLabel">Min Ratio</label>
<input type="number" step="0.0001" id="minRatio" value="0.001" min="0.0001" max="1">
<label id="maxRatioLabel">Max Ratio</label>
<input type="number" step="0.0001" id="maxRatio" value="1" min="0.0001" max="1">
</div> </div>
<div id='rawText' class=''> <div id='rawText' class=''>
<div id="wordcount">字數:0</div>
<textarea id='rawTextBox' rows=25 placeholder="輸入要視覺化的文字 <textarea id='rawTextBox' rows=25 placeholder="輸入要視覺化的文字
換行為斷句"></textarea> 換行為斷句" onchange="countWords()"></textarea>
<button class='general-button' style='margin: 10px 0px' onclick="submit()">提交</button> <button class='general-button' style='margin: 10px 0px' onclick="submit()">提交</button>
</div> </div>
<div> <div>
<button id='toggleTextBox' class='general-button' style='margin: 0px 10px' onclick="switchMessageBox()">隱藏文字視窗</button> <button id='toggleTextBox' class='general-button' style='margin: 0px 10px'
onclick="switchMessageBox()">隱藏文字視窗</button>
<div id='graph' class='hidden'> <div id='graph' class='hidden'>
<div id='vis'></div> <div id='vis'></div>
</div> </div>

@ -0,0 +1,46 @@
<!DOCTYPE html>
<html lang="zh-Hant">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<link href="/static/css/w3.css" type="text/css" rel="stylesheet">
<link href="/static/css/main.css" type="text/css" rel="stylesheet">
<title>使用說明</title>
</head>
<body>
<h1>使用說明</h1>
<h2>基本說明</h2>
<p>泛用文字視覺化工具能夠簡單處理文字檔的視覺化。<br>
支援的語言:繁體中文、英文以及所有使用空格分詞的語言。<br>
使用繁體中文Jieba斷詞器不保證簡體中文能夠正常使用。</p>
<h2>參數</h2>
<p>此工具提供 <span style="color: red;">minRatio</span>、<span style="color: red;">maxRatio</span> 兩個參數的設定<br>
兩個參數代表相鄰兩個單詞(有被連接的單詞)之間的最大或最小比值<br>
例如:maxRatio 為 0.8 時,代表兩個相鄰單詞出現頻率的比值必須小於 0.8,否則單詞就不會被演算法選中。
</p>
<h2>輸入資料前處理</h2>
<p>本工具會將輸入資料做預先處理。以中文語料為例,處理流程大致如下:</p>
<p>1. 斷句:使用中文的全形句號(。)及換行進行斷句</p>
<p>2. 斷詞並標記詞性:使用 Jieba 將每個句子分別斷詞,並標註其詞性</p>
<p>3. 過濾詞性:將英文及數字過濾,以免產生過多雜訊</p>
<h2>停用詞</h2>
<p>使用者可以編輯停用詞,被設定為停用詞的單詞,將不會被選擇到 sententree 中。</p>
<p>在輸入停用詞時,可以一次輸入多個停用詞,並使用空格(半形)分開。</p>
<h2>Sententree 圖形</h2>
<p>輸入一份文件預設會產生2個 sententree 圖形,圖 1 為一個 sententree 的圖形</p>
<p>每個圖形中間最大的單詞為<span style="color:blue;">根單詞</span></p>
<p>其中第二個圖形中不會包含第一個圖形的根單詞</p>
<p>單詞之間的連線代表兩個單詞有在同一個句子中出現過</p>
<p>灰色連線代表演算法在搜尋時,兩個單詞屬於同一個階層(出現在相同的句子中)</p>
<p>橘色連線代表兩個單詞屬於不同階層</p>
<p>連線的粗細代表兩個單詞同時出現的比例</p>
<p>將滑鼠移到單詞上,能夠看到包含該單詞的完整句子(最多顯示 5 筆),如圖 2</p>
<img style="width: 100%;" src="/img/general_txt_help_g01.png">
<span>圖 1:Sententree 圖形</span>
<img style="width: 100%;" src="/img/general_txt_help_g02.png">
<span>圖 2:完整句子顯示</span>
</body>
</html>
Loading…
Cancel
Save