加入ratio調整及滑鼠事件

master
zovjsra 2 years ago
parent 04d1eb79ba
commit 4ba0a7e901

@ -2,6 +2,7 @@ import jieba
import csv
import nltk
import re
import json
from jieba import posseg
from nltk import tokenize
from langdetect import detect
@ -31,38 +32,37 @@ def processText(randId, text, stopwords):
return ''
lang = detect(text)
sentenses = []
sentenses_raw = []
print(lang)
if (lang == 'zh-cn' or lang == 'zh-tw' or lang == 'ko'):
splitted = re.split('。|[\n]+', text)
print(splitted)
cutted = []
for i in splitted:
cutted.append(filterPOS(i))
for spl in splitted:
cutted.append(filterPOS(spl))
print(cutted)
for i in cutted:
result = []
for j in i:
if (j in stopwords):
continue
result.append(j)
if (len(result) >= 20):
sentenses.append(' '.join(result.copy()))
result = []
if (result != []):
sentenses.append(' '.join(result))
for spl, raw in zip(cutted, splitted):
sentenses.append(' '.join(spl))
sentenses_raw.append(raw)
else:
sentenses = []
for sentence in tokenize.sent_tokenize(text):
words = sentence.lower().split(' ')
sentenses_raw = []
for sentence_raw in tokenize.sent_tokenize(text):
words = sentence_raw.lower().split(' ')
print([''.join([a for a in w1 if a.isalpha()]) for w1 in words])
sentence = ' '.join([w for w in [''.join([a for a in w1 if a.isalpha()]) for w1 in words] if w not in [sw.lower() for sw in stopwords]])
sentenses.append(sentence)
sentenses_raw.append(sentence_raw)
result = []
result.append(['id', 'text', 'count'])
for index, sentence in enumerate(sentenses):
result.append([index, sentence, 1000])
with open('data/' + randId + '.tsv', 'w', newline='', encoding="utf-8") as f:
writer = csv.writer(f, delimiter='\t')
writer.writerows(result)
f.close()
return ('data/' + randId + '.tsv')
for index, raw_pair in enumerate(zip(sentenses, sentenses_raw)):
sentence, sentence_raw = raw_pair
result.append({
'id': index,
'text': sentence,
'count': 10,
'rawtxt': sentence_raw,
})
with open('data/' + randId + '.json', 'w', newline='', encoding="utf-8") as fp:
json.dump(result, fp, ensure_ascii=False, indent=4)
return ('data/' + randId + '.json')

@ -1,6 +1,15 @@
// Path of the data file that buildSentetree() hands to the d3 loader.
// NOTE(review): the name says "tsv", but the loader call visible in
// buildSentetree() is d3.json(tsvPath, ...) — confirm and consider renaming.
var tsvPath
// Current stop-word list; clearStopWord() resets it to an empty array.
var stopwords = []
// Wire up page-level listeners: on every mouse move over the window, move the
// #nodeTitle element to the pointer's page coordinates.
const init = () => {
    const followPointer = (event) => {
        const { pageX, pageY } = event
        $('#nodeTitle').css({ left: pageX, top: pageY })
    }
    $(window).on('mousemove', followPointer)
}
function clearStopWord() {
stopwords = []
$('#sweContainer').html('')
@ -86,22 +95,34 @@ function submit() {
function buildSentetree() {
console.log("Build.")
var model;
var tree;
var data;
const graph = d3.tsv(tsvPath, buildTree);
let model;
let tree;
let data;
const graph = d3.json(tsvPath, buildTree);
function buildTree(error, rawdata) {
console.log(rawdata)
const data = rawdata.map(d => Object.assign({}, d, { count: +d.count }));
console.log({ data })
let minRatio = $('#minRatio').val()
let maxRatio = $('#maxRatio').val()
console.log({ minRatio, maxRatio })
model = new SentenTree.SentenTreeBuilder()
.tokenize(SentenTree.tokenizer.tokenizeBySpace)
.transformToken(token => (/score(d|s)?/.test(token) ? 'score' : token))
.buildModel(data, { maxSupportRatio: 1 });
.buildModel(data, { maxSupportRatio: maxRatio, minSupportRatio: minRatio });
tree = new SentenTree.SentenTreeVis('#vis', {
fontSize: [15, 40],
gapBetweenGraph: 10
});
tree.data(model.getRenderedGraphs(2))
.on('nodeMouseenter', (node) => {
$('#nodeTitle').removeClass('hidden')
$('#nodeTitleContent').html(node.data.topEntries.map((n) => data[n.id].rawtxt).join('<br>'))
})
.on('nodeMouseleave', () => {
$('#nodeTitle').addClass('hidden')
})
new ResizeSensor(jQuery('#d3kitRoot'), function () {
var scale, origin;
scale = Math.min(2, ($('#graph').outerWidth()) / ($('#d3kitRoot').outerWidth() + 60))
@ -133,3 +154,5 @@ function countWords() {
console.log(wordCount)
$("#wordcount").html('字數:' + wordCount)
}
init()

@ -9,6 +9,9 @@
</head>
<body>
<div id="nodeTitle" class="nodeTitle hidden">
<div id="nodeTitleContent">test</div>
</div>
<div id="stopWordEditorLayer" class="info hidden">
<div id="stopWordEditor">
<h4 id="sweTitle" style="margin:10px; display: inline;">編輯停用詞</h4>
@ -39,13 +42,17 @@
<div id='heading'>
<h2>泛用文字視覺化工具</h2>
<p>SentenTree <a href="https://github.com/twitter/SentenTree">https://github.com/twitter/SentenTree</a></p>
<p id='comment'>這是泛用.txt檔視覺化工具,能夠簡單處理文字檔的視覺化。</p>
<p id='comment'>這是泛用文字視覺化工具,能夠簡單處理文字檔的視覺化。</p>
<p id='comment'>支援的語言:繁體中文、英文以及所有使用空格分詞的語言。</p>
<p id='comment'>使用繁體中文Jieba斷詞器不保證簡體中文能夠正常使用。</p>
</div>
<div style="margin:10px;">
<button class="general-button" type="button" id="editSWButton" style="margin:10px 0px;"
onclick="showStopwordEditor()">編輯停用詞</button>
<label>Min Ratio</label>
<input type="number" step="0.0001" id="minRatio" value="0.001" min="0.0001" max="1">
<label>Max Ratio</label>
<input type="number" step="0.0001" id="maxRatio" value="1" min="0.0001" max="1">
</div>
<div id='rawText' class=''>
<div id="wordcount">字數0</div>

Loading…
Cancel
Save