加入ratio調整及滑鼠事件

master
zovjsra 2 years ago
parent 04d1eb79ba
commit 4ba0a7e901

@ -2,6 +2,7 @@ import jieba
import csv import csv
import nltk import nltk
import re import re
import json
from jieba import posseg from jieba import posseg
from nltk import tokenize from nltk import tokenize
from langdetect import detect from langdetect import detect
@ -31,38 +32,37 @@ def processText(randId, text, stopwords):
return '' return ''
lang = detect(text) lang = detect(text)
sentenses = [] sentenses = []
sentenses_raw = []
print(lang) print(lang)
if (lang == 'zh-cn' or lang == 'zh-tw' or lang == 'ko'): if (lang == 'zh-cn' or lang == 'zh-tw' or lang == 'ko'):
splitted = re.split('。|[\n]+', text) splitted = re.split('。|[\n]+', text)
print(splitted) print(splitted)
cutted = [] cutted = []
for i in splitted: for spl in splitted:
cutted.append(filterPOS(i)) cutted.append(filterPOS(spl))
print(cutted) print(cutted)
for i in cutted: for spl, raw in zip(cutted, splitted):
result = [] sentenses.append(' '.join(spl))
for j in i: sentenses_raw.append(raw)
if (j in stopwords):
continue
result.append(j)
if (len(result) >= 20):
sentenses.append(' '.join(result.copy()))
result = []
if (result != []):
sentenses.append(' '.join(result))
else: else:
sentenses = [] sentenses = []
for sentence in tokenize.sent_tokenize(text): sentenses_raw = []
words = sentence.lower().split(' ') for sentence_raw in tokenize.sent_tokenize(text):
words = sentence_raw.lower().split(' ')
print([''.join([a for a in w1 if a.isalpha()]) for w1 in words]) print([''.join([a for a in w1 if a.isalpha()]) for w1 in words])
sentence = ' '.join([w for w in [''.join([a for a in w1 if a.isalpha()]) for w1 in words] if w not in [sw.lower() for sw in stopwords]]) sentence = ' '.join([w for w in [''.join([a for a in w1 if a.isalpha()]) for w1 in words] if w not in [sw.lower() for sw in stopwords]])
sentenses.append(sentence) sentenses.append(sentence)
sentenses_raw.append(sentence_raw)
result = [] result = []
result.append(['id', 'text', 'count']) for index, raw_pair in enumerate(zip(sentenses, sentenses_raw)):
for index, sentence in enumerate(sentenses): sentence, sentence_raw = raw_pair
result.append([index, sentence, 1000]) result.append({
with open('data/' + randId + '.tsv', 'w', newline='', encoding="utf-8") as f: 'id': index,
writer = csv.writer(f, delimiter='\t') 'text': sentence,
writer.writerows(result) 'count': 10,
f.close() 'rawtxt': sentence_raw,
return ('data/' + randId + '.tsv') })
with open('data/' + randId + '.json', 'w', newline='', encoding="utf-8") as fp:
json.dump(result, fp, ensure_ascii=False, indent=4)
return ('data/' + randId + '.json')

@ -1,6 +1,15 @@
var tsvPath var tsvPath
var stopwords = [] var stopwords = []
/**
 * Register the page-wide mouse listener.
 *
 * On every `mousemove` over the window, the `#nodeTitle` element's `left`
 * and `top` CSS properties are set to the pointer's page coordinates.
 */
const init = () => {
    $(window).on('mousemove', (event) => {
        // The element is looked up on each event (not cached up front).
        const pointerPosition = { left: event.pageX, top: event.pageY }
        $('#nodeTitle').css(pointerPosition)
    })
}
function clearStopWord() { function clearStopWord() {
stopwords = [] stopwords = []
$('#sweContainer').html('') $('#sweContainer').html('')
@ -86,22 +95,34 @@ function submit() {
function buildSentetree() { function buildSentetree() {
console.log("Build.") console.log("Build.")
var model; let model;
var tree; let tree;
var data; let data;
const graph = d3.tsv(tsvPath, buildTree); const graph = d3.json(tsvPath, buildTree);
function buildTree(error, rawdata) { function buildTree(error, rawdata) {
console.log(rawdata)
const data = rawdata.map(d => Object.assign({}, d, { count: +d.count })); const data = rawdata.map(d => Object.assign({}, d, { count: +d.count }));
console.log({ data })
let minRatio = $('#minRatio').val()
let maxRatio = $('#maxRatio').val()
console.log({ minRatio, maxRatio })
model = new SentenTree.SentenTreeBuilder() model = new SentenTree.SentenTreeBuilder()
.tokenize(SentenTree.tokenizer.tokenizeBySpace) .tokenize(SentenTree.tokenizer.tokenizeBySpace)
.transformToken(token => (/score(d|s)?/.test(token) ? 'score' : token)) .transformToken(token => (/score(d|s)?/.test(token) ? 'score' : token))
.buildModel(data, { maxSupportRatio: 1 }); .buildModel(data, { maxSupportRatio: maxRatio, minSupportRatio: minRatio });
tree = new SentenTree.SentenTreeVis('#vis', { tree = new SentenTree.SentenTreeVis('#vis', {
fontSize: [15, 40], fontSize: [15, 40],
gapBetweenGraph: 10 gapBetweenGraph: 10
}); });
tree.data(model.getRenderedGraphs(2)) tree.data(model.getRenderedGraphs(2))
.on('nodeMouseenter', (node) => {
$('#nodeTitle').removeClass('hidden')
$('#nodeTitleContent').html(node.data.topEntries.map((n) => data[n.id].rawtxt).join('<br>'))
})
.on('nodeMouseleave', () => {
$('#nodeTitle').addClass('hidden')
})
new ResizeSensor(jQuery('#d3kitRoot'), function () { new ResizeSensor(jQuery('#d3kitRoot'), function () {
var scale, origin; var scale, origin;
scale = Math.min(2, ($('#graph').outerWidth()) / ($('#d3kitRoot').outerWidth() + 60)) scale = Math.min(2, ($('#graph').outerWidth()) / ($('#d3kitRoot').outerWidth() + 60))
@ -133,3 +154,5 @@ function countWords() {
console.log(wordCount) console.log(wordCount)
$("#wordcount").html('字數:' + wordCount) $("#wordcount").html('字數:' + wordCount)
} }
init()

@ -9,6 +9,9 @@
</head> </head>
<body> <body>
<div id="nodeTitle" class="nodeTitle hidden">
<div id="nodeTitleContent">test</div>
</div>
<div id="stopWordEditorLayer" class="info hidden"> <div id="stopWordEditorLayer" class="info hidden">
<div id="stopWordEditor"> <div id="stopWordEditor">
<h4 id="sweTitle" style="margin:10px; display: inline;">編輯停用詞</h4> <h4 id="sweTitle" style="margin:10px; display: inline;">編輯停用詞</h4>
@ -39,13 +42,17 @@
<div id='heading'> <div id='heading'>
<h2>泛用文字視覺化工具</h2> <h2>泛用文字視覺化工具</h2>
<p>SentenTree <a href="https://github.com/twitter/SentenTree">https://github.com/twitter/SentenTree</a></p> <p>SentenTree <a href="https://github.com/twitter/SentenTree">https://github.com/twitter/SentenTree</a></p>
<p id='comment'>這是泛用.txt檔視覺化工具,能夠簡單處理文字檔的視覺化。</p> <p id='comment'>這是泛用文字視覺化工具,能夠簡單處理文字檔的視覺化。</p>
<p id='comment'>支援的語言:繁體中文、英文以及所有使用空格分詞的語言。</p> <p id='comment'>支援的語言:繁體中文、英文以及所有使用空格分詞的語言。</p>
<p id='comment'>使用繁體中文Jieba斷詞器不保證簡體中文能夠正常使用。</p> <p id='comment'>使用繁體中文Jieba斷詞器不保證簡體中文能夠正常使用。</p>
</div> </div>
<div style="margin:10px;"> <div style="margin:10px;">
<button class="general-button" type="button" id="editSWButton" style="margin:10px 0px;" <button class="general-button" type="button" id="editSWButton" style="margin:10px 0px;"
onclick="showStopwordEditor()">編輯停用詞</button> onclick="showStopwordEditor()">編輯停用詞</button>
<label>Min Ratio</label>
<input type="number" step="0.0001" id="minRatio" value="0.001" min="0.0001" max="1">
<label>Max Ratio</label>
<input type="number" step="0.0001" id="maxRatio" value="1" min="0.0001" max="1">
</div> </div>
<div id='rawText' class=''> <div id='rawText' class=''>
<div id="wordcount">字數0</div> <div id="wordcount">字數0</div>

Loading…
Cancel
Save