加入ratio調整及滑鼠事件

master
zovjsra 2 years ago
parent 04d1eb79ba
commit 4ba0a7e901

@ -6,8 +6,8 @@ from pprint import pprint
import threading import threading
import random import random
import string import string
#import dataHandlerPTT as ptt # import dataHandlerPTT as ptt
#import dataHandlerPTTPush as pttPush # import dataHandlerPTTPush as pttPush
import generalText as gen import generalText as gen
import json import json

@ -2,6 +2,7 @@ import jieba
import csv import csv
import nltk import nltk
import re import re
import json
from jieba import posseg from jieba import posseg
from nltk import tokenize from nltk import tokenize
from langdetect import detect from langdetect import detect
@ -27,42 +28,41 @@ def filterPOS(text):
def processText(randId, text, stopwords):
    """Split *text* into sentences and write them to ``data/<randId>.json``.

    Each sentence becomes one JSON record with the keys ``id`` (running
    index), ``text`` (space-joined tokens with stopwords removed),
    ``count`` (fixed at 10) and ``rawtxt`` (the untouched source sentence).

    Args:
        randId: String used as the output file's base name.
        text: Raw input text. An empty string short-circuits.
        stopwords: Iterable of words to drop from the tokenised ``text``.

    Returns:
        The path of the JSON file that was written, or ``''`` when *text*
        is empty (no file is written in that case).
    """
    if text == '':
        return ''

    lang = detect(text)
    print(lang)

    sentenses = []
    sentenses_raw = []

    if lang in ('zh-cn', 'zh-tw', 'ko'):
        # These languages have no space-delimited words: split on the
        # full-width period / newlines and let filterPOS tokenise each piece.
        splitted = re.split('。|[\n]+', text)
        print(splitted)
        # presumably filterPOS returns an iterable of token strings (they are
        # joined with ' ' below) -- verify against its definition.
        cutted = [filterPOS(spl) for spl in splitted]
        print(cutted)
        # Apply the caller's stopwords here as well, so both language
        # branches honour the parameter. Matching is exact (not
        # case-folded) in this branch.
        stopword_set = set(stopwords)
        for spl, raw in zip(cutted, splitted):
            sentenses.append(' '.join(w for w in spl if w not in stopword_set))
            sentenses_raw.append(raw)
    else:
        # Build the lower-cased stopword lookup once instead of once per word.
        lowered_stopwords = {sw.lower() for sw in stopwords}
        for sentence_raw in tokenize.sent_tokenize(text):
            words = sentence_raw.lower().split(' ')
            # Keep only alphabetic characters of every word.
            cleaned = [''.join([a for a in w1 if a.isalpha()]) for w1 in words]
            print(cleaned)
            sentence = ' '.join([w for w in cleaned if w not in lowered_stopwords])
            sentenses.append(sentence)
            sentenses_raw.append(sentence_raw)

    result = [
        {
            'id': index,
            'text': sentence,
            'count': 10,
            'rawtxt': sentence_raw,
        }
        for index, (sentence, sentence_raw) in enumerate(zip(sentenses, sentenses_raw))
    ]

    with open('data/' + randId + '.json', 'w', newline='', encoding="utf-8") as fp:
        json.dump(result, fp, ensure_ascii=False, indent=4)
    return ('data/' + randId + '.json')

@ -1,6 +1,15 @@
var tsvPath var tsvPath
var stopwords = [] var stopwords = []
const init = () => {
    // On every mouse move over the window, set the left/top CSS of
    // #nodeTitle to the pointer's page coordinates.
    const followPointer = function (e) {
        const position = { left: e.pageX, top: e.pageY }
        $('#nodeTitle').css(position)
    }
    $(window).on('mousemove', followPointer)
}
function clearStopWord() { function clearStopWord() {
stopwords = [] stopwords = []
$('#sweContainer').html('') $('#sweContainer').html('')
@ -86,22 +95,34 @@ function submit() {
function buildSentetree() { function buildSentetree() {
console.log("Build.") console.log("Build.")
var model; let model;
var tree; let tree;
var data; let data;
const graph = d3.tsv(tsvPath, buildTree); const graph = d3.json(tsvPath, buildTree);
function buildTree(error, rawdata) { function buildTree(error, rawdata) {
console.log(rawdata)
const data = rawdata.map(d => Object.assign({}, d, { count: +d.count })); const data = rawdata.map(d => Object.assign({}, d, { count: +d.count }));
console.log({ data })
let minRatio = $('#minRatio').val()
let maxRatio = $('#maxRatio').val()
console.log({ minRatio, maxRatio })
model = new SentenTree.SentenTreeBuilder() model = new SentenTree.SentenTreeBuilder()
.tokenize(SentenTree.tokenizer.tokenizeBySpace) .tokenize(SentenTree.tokenizer.tokenizeBySpace)
.transformToken(token => (/score(d|s)?/.test(token) ? 'score' : token)) .transformToken(token => (/score(d|s)?/.test(token) ? 'score' : token))
.buildModel(data, { maxSupportRatio: 1 }); .buildModel(data, { maxSupportRatio: maxRatio, minSupportRatio: minRatio });
tree = new SentenTree.SentenTreeVis('#vis', { tree = new SentenTree.SentenTreeVis('#vis', {
fontSize: [15, 40], fontSize: [15, 40],
gapBetweenGraph: 10 gapBetweenGraph: 10
}); });
tree.data(model.getRenderedGraphs(2)) tree.data(model.getRenderedGraphs(2))
.on('nodeMouseenter', (node) => {
$('#nodeTitle').removeClass('hidden')
$('#nodeTitleContent').html(node.data.topEntries.map((n) => data[n.id].rawtxt).join('<br>'))
})
.on('nodeMouseleave', () => {
$('#nodeTitle').addClass('hidden')
})
new ResizeSensor(jQuery('#d3kitRoot'), function () { new ResizeSensor(jQuery('#d3kitRoot'), function () {
var scale, origin; var scale, origin;
scale = Math.min(2, ($('#graph').outerWidth()) / ($('#d3kitRoot').outerWidth() + 60)) scale = Math.min(2, ($('#graph').outerWidth()) / ($('#d3kitRoot').outerWidth() + 60))
@ -133,3 +154,5 @@ function countWords() {
console.log(wordCount) console.log(wordCount)
$("#wordcount").html('字數:' + wordCount) $("#wordcount").html('字數:' + wordCount)
} }
init()

@ -17,7 +17,7 @@ function init() {
type: 'POST', type: 'POST',
url: '/init', url: '/init',
dataType: 'json', dataType: 'json',
success: function(data) { success: function (data) {
console.log(data) console.log(data)
setDate(data.Result.startDate, data.Result.endDate) setDate(data.Result.startDate, data.Result.endDate)
document.getElementById('keywordBox').value = data.Result.keyword document.getElementById('keywordBox').value = data.Result.keyword
@ -39,44 +39,44 @@ function init() {
buildSentetree() buildSentetree()
} }
}) })
$(document).ready(function() { $(document).ready(function () {
$(window).keydown(function(event) { $(window).keydown(function (event) {
if (event.keyCode == 13) { if (event.keyCode == 13) {
event.preventDefault() event.preventDefault()
sendRequest() sendRequest()
} }
}); });
}); });
$(window).on('mousemove', function(e) { $(window).on('mousemove', function (e) {
$('#nodeTitle').css({ $('#nodeTitle').css({
left: e.pageX, left: e.pageX,
top: e.pageY top: e.pageY
}) })
}) })
$('#titleListContainer').hover( $('#titleListContainer').hover(
function() { // Run on hover/mouseenter function () { // Run on hover/mouseenter
$(this).css('overflow', 'auto') $(this).css('overflow', 'auto')
}, },
function() { // Run on mouseleave function () { // Run on mouseleave
$(this).css('overflow', 'hidden') $(this).css('overflow', 'hidden')
} }
) )
$('#titleListLayer').click(function(e) { $('#titleListLayer').click(function (e) {
if ($('#titleListLayer').is(e.target)) { if ($('#titleListLayer').is(e.target)) {
hideTitles() hideTitles()
} }
}) })
$('#stopWordEditorLayer').click(function(e) { $('#stopWordEditorLayer').click(function (e) {
if ($('#stopWordEditorLayer').is(e.target)) { if ($('#stopWordEditorLayer').is(e.target)) {
hideStopWordEditor() hideStopWordEditor()
} }
}) })
$('#idfEditorLayer').click(function(e) { $('#idfEditorLayer').click(function (e) {
if ($('#idfEditorLayer').is(e.target)) { if ($('#idfEditorLayer').is(e.target)) {
hideIdfEditor() hideIdfEditor()
} }
}) })
$('#pttPageWindow').click(function(e) { $('#pttPageWindow').click(function (e) {
if ($('#pttPageWindow').is(e.target)) { if ($('#pttPageWindow').is(e.target)) {
hidePTTPage() hidePTTPage()
} }
@ -127,7 +127,7 @@ function addStopWord() {
} else { } else {
stopwords.push(newsw) stopwords.push(newsw)
$('#sweContainer').append($('<li>').attr('class', 'w3-display-container').append($('<span>').append(newsw)).append($('<span>').attr('class', 'w3-button w3-hover-red w3-transparent w3-display-right').click(function(e) { $('#sweContainer').append($('<li>').attr('class', 'w3-display-container').append($('<span>').append(newsw)).append($('<span>').attr('class', 'w3-button w3-hover-red w3-transparent w3-display-right').click(function (e) {
var index = $(this).parent().index() var index = $(this).parent().index()
console.log(stopwords[index]) console.log(stopwords[index])
stopwords.splice(index, 1) stopwords.splice(index, 1)
@ -172,14 +172,14 @@ function scrollIdfList() {
function showStopwordEditor() { function showStopwordEditor() {
$(window).unbind('keydown') $(window).unbind('keydown')
$(window).keydown(function(event) { $(window).keydown(function (event) {
if (event.keyCode == 13) { if (event.keyCode == 13) {
addStopWord() addStopWord()
} }
}) })
$('#sweContainer').empty() $('#sweContainer').empty()
for (word of stopwords) { for (word of stopwords) {
$('#sweContainer').append($('<li>').append($('<span>').append(word)).append($('<span>').attr('class', 'w3-button w3-hover-red w3-transparent w3-display-right').click(function(e) { $('#sweContainer').append($('<li>').append($('<span>').append(word)).append($('<span>').attr('class', 'w3-button w3-hover-red w3-transparent w3-display-right').click(function (e) {
var index = $(this).parent().index() var index = $(this).parent().index()
console.log(stopwords[index]) console.log(stopwords[index])
stopwords.splice(index, 1) stopwords.splice(index, 1)
@ -192,7 +192,7 @@ function showStopwordEditor() {
function showIdfEditor() { function showIdfEditor() {
$(window).unbind('keydown') $(window).unbind('keydown')
$(window).keydown(function(event) { $(window).keydown(function (event) {
if (event.keyCode == 13) { if (event.keyCode == 13) {
scrollIdfList() scrollIdfList()
} }
@ -226,7 +226,7 @@ function showIdfEditor() {
.append($('<button>') .append($('<button>')
.attr('class', 'general-button') .attr('class', 'general-button')
.html('設為最小') .html('設為最小')
.click(function() { .click(function () {
$(this) $(this)
.parent() .parent()
.parent() .parent()
@ -236,7 +236,7 @@ function showIdfEditor() {
.append($('<button>') .append($('<button>')
.attr('class', 'general-button') .attr('class', 'general-button')
.html('設為最大') .html('設為最大')
.click(function() { .click(function () {
$(this) $(this)
.parent() .parent()
.parent() .parent()
@ -248,7 +248,7 @@ function showIdfEditor() {
.append($('<button>') .append($('<button>')
.attr('class', 'general-button') .attr('class', 'general-button')
.html('重設') .html('重設')
.click(function() { .click(function () {
var _word = $($(this) var _word = $($(this)
.parent() .parent()
.parent() .parent()
@ -275,7 +275,7 @@ function showIdfEditor() {
function hideStopWordEditor() { function hideStopWordEditor() {
$(window).unbind('keydown') $(window).unbind('keydown')
$(window).keydown(function(event) { $(window).keydown(function (event) {
if (event.keyCode == 13) { if (event.keyCode == 13) {
event.preventDefault() event.preventDefault()
sendRequest() sendRequest()
@ -286,7 +286,7 @@ function hideStopWordEditor() {
function hideIdfEditor() { function hideIdfEditor() {
$(window).unbind('keydown') $(window).unbind('keydown')
$(window).keydown(function(event) { $(window).keydown(function (event) {
if (event.keyCode == 13) { if (event.keyCode == 13) {
event.preventDefault() event.preventDefault()
sendRequest() sendRequest()
@ -369,7 +369,7 @@ function sendRequest() {
url: '/addRequest', url: '/addRequest',
data: content, data: content,
contentType: 'application/json', contentType: 'application/json',
success: function(data) { success: function (data) {
console.log(data) console.log(data)
changeGraph(data.Result) changeGraph(data.Result)
} }
@ -460,14 +460,14 @@ function buildSentetree() {
}) })
}) })
} else { } else {
seqList = node.data.seq.DBs.map(function(n) { seqList = node.data.seq.DBs.map(function (n) {
return n.rawText return n.rawText
}) })
} }
titleList = [] titleList = []
for (s of seqList) { for (s of seqList) {
titleTemp = wordTitleList[s] titleTemp = wordTitleList[s]
if ((titleList.map(function(n) { if ((titleList.map(function (n) {
return n.title return n.title
})).indexOf(titleTemp.title) == -1) { })).indexOf(titleTemp.title) == -1) {
titleList.push(titleTemp) titleList.push(titleTemp)
@ -508,7 +508,7 @@ function buildSentetree() {
globKeyword: globKeyword globKeyword: globKeyword
}), }),
contentType: 'application/json', contentType: 'application/json',
success: function(data) { success: function (data) {
console.log(data) console.log(data)
$('#titleListKeywordInfo').html('單詞出現次數:' + data.Result.wordCount + ', 單詞出現的文章數:' + data.Result.postCount + ', 單詞頻率:' + (data.Result.postCount * 100 / totalPosts).toFixed(2) + '%') $('#titleListKeywordInfo').html('單詞出現次數:' + data.Result.wordCount + ', 單詞出現的文章數:' + data.Result.postCount + ', 單詞頻率:' + (data.Result.postCount * 100 / totalPosts).toFixed(2) + '%')
} }
@ -527,7 +527,7 @@ function buildSentetree() {
).append( ).append(
$('<span>').attr('style', 'margin: 0px 10px').html('推文數:' + i.pushes) $('<span>').attr('style', 'margin: 0px 10px').html('推文數:' + i.pushes)
) )
).click(function() { ).click(function () {
let indx = $(this).index() let indx = $(this).index()
showPTTPage((titleList[indx].url).replace('www.ptt.cc', 'www.pttweb.cc')) showPTTPage((titleList[indx].url).replace('www.ptt.cc', 'www.pttweb.cc'))
}) })
@ -536,7 +536,7 @@ function buildSentetree() {
}) })
.on('nodeMouseenter', node => { .on('nodeMouseenter', node => {
console.log(node) console.log(node)
titles = node.data.topEntries.map(function(x) { titles = node.data.topEntries.map(function (x) {
return wordTitleList[x.rawText] return wordTitleList[x.rawText]
}) })
console.log(titles) console.log(titles)
@ -564,7 +564,7 @@ function buildSentetree() {
}).on('linkMouseenter', link => { }).on('linkMouseenter', link => {
console.log(link) console.log(link)
}) })
new ResizeSensor(jQuery('#d3kitRoot'), function() { new ResizeSensor(jQuery('#d3kitRoot'), function () {
var scale, origin; var scale, origin;
scale = Math.min(2, ($('#graph').outerWidth()) / ($('#d3kitRoot').outerWidth() + 60)) scale = Math.min(2, ($('#graph').outerWidth()) / ($('#d3kitRoot').outerWidth() + 60))

@ -9,6 +9,9 @@
</head> </head>
<body> <body>
<div id="nodeTitle" class="nodeTitle hidden">
<div id="nodeTitleContent">test</div>
</div>
<div id="stopWordEditorLayer" class="info hidden"> <div id="stopWordEditorLayer" class="info hidden">
<div id="stopWordEditor"> <div id="stopWordEditor">
<h4 id="sweTitle" style="margin:10px; display: inline;">編輯停用詞</h4> <h4 id="sweTitle" style="margin:10px; display: inline;">編輯停用詞</h4>
@ -39,13 +42,17 @@
<div id='heading'> <div id='heading'>
<h2>泛用文字視覺化工具</h2> <h2>泛用文字視覺化工具</h2>
<p>SentenTree <a href="https://github.com/twitter/SentenTree">https://github.com/twitter/SentenTree</a></p> <p>SentenTree <a href="https://github.com/twitter/SentenTree">https://github.com/twitter/SentenTree</a></p>
<p id='comment'>這是泛用.txt檔視覺化工具,能夠簡單處理文字檔的視覺化。</p> <p id='comment'>這是泛用文字視覺化工具,能夠簡單處理文字檔的視覺化。</p>
<p id='comment'>支援的語言:繁體中文、英文以及所有使用空格分詞的語言。</p> <p id='comment'>支援的語言:繁體中文、英文以及所有使用空格分詞的語言。</p>
<p id='comment'>使用繁體中文Jieba斷詞器不保證簡體中文能夠正常使用。</p> <p id='comment'>使用繁體中文Jieba斷詞器不保證簡體中文能夠正常使用。</p>
</div> </div>
<div style="margin:10px;"> <div style="margin:10px;">
<button class="general-button" type="button" id="editSWButton" style="margin:10px 0px;" <button class="general-button" type="button" id="editSWButton" style="margin:10px 0px;"
onclick="showStopwordEditor()">編輯停用詞</button> onclick="showStopwordEditor()">編輯停用詞</button>
<label>Min Ratio</label>
<input type="number" step="0.0001" id="minRatio" value="0.001" min="0.0001" max="1">
<label>Max Ratio</label>
<input type="number" step="0.0001" id="maxRatio" value="1" min="0.0001" max="1">
</div> </div>
<div id='rawText' class=''> <div id='rawText' class=''>
<div id="wordcount">字數0</div> <div id="wordcount">字數0</div>

Loading…
Cancel
Save