From 4ba0a7e901a6927d19fc5f1da1839f0bc0d0802d Mon Sep 17 00:00:00 2001
From: zovjsra <110753121@nccu.edu.tw>
Date: Tue, 9 May 2023 07:01:08 +0000
Subject: [PATCH] =?UTF-8?q?=E5=8A=A0=E5=85=A5ratio=E8=AA=BF=E6=95=B4?=
=?UTF-8?q?=E5=8F=8A=E6=BB=91=E9=BC=A0=E4=BA=8B=E4=BB=B6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
app.py | 4 +-
generalText.py | 48 ++++++-------
static/js/generalText.js | 35 ++++++++--
static/js/ptt.js | 142 +++++++++++++++++++-------------------
templates/generalTxt.html | 9 ++-
5 files changed, 134 insertions(+), 104 deletions(-)
diff --git a/app.py b/app.py
index 5586bde..b0b9a99 100755
--- a/app.py
+++ b/app.py
@@ -6,8 +6,8 @@ from pprint import pprint
import threading
import random
import string
-#import dataHandlerPTT as ptt
-#import dataHandlerPTTPush as pttPush
+# import dataHandlerPTT as ptt
+# import dataHandlerPTTPush as pttPush
import generalText as gen
import json
diff --git a/generalText.py b/generalText.py
index 7c13b61..5038fd4 100644
--- a/generalText.py
+++ b/generalText.py
@@ -2,6 +2,7 @@ import jieba
import csv
import nltk
import re
+import json
from jieba import posseg
from nltk import tokenize
from langdetect import detect
@@ -27,42 +28,41 @@ def filterPOS(text):
def processText(randId, text, stopwords):
- if(text == ''):
+ if (text == ''):
return ''
lang = detect(text)
sentenses = []
+ sentenses_raw = []
print(lang)
if (lang == 'zh-cn' or lang == 'zh-tw' or lang == 'ko'):
splitted = re.split('。|[\n]+', text)
print(splitted)
cutted = []
- for i in splitted:
- cutted.append(filterPOS(i))
+ for spl in splitted:
+ cutted.append(filterPOS(spl))
print(cutted)
- for i in cutted:
- result = []
- for j in i:
- if (j in stopwords):
- continue
- result.append(j)
- if (len(result) >= 20):
- sentenses.append(' '.join(result.copy()))
- result = []
- if (result != []):
- sentenses.append(' '.join(result))
+ for spl, raw in zip(cutted, splitted):
+ sentenses.append(' '.join(spl))
+ sentenses_raw.append(raw)
else:
sentenses = []
- for sentence in tokenize.sent_tokenize(text):
- words = sentence.lower().split(' ')
+ sentenses_raw = []
+ for sentence_raw in tokenize.sent_tokenize(text):
+ words = sentence_raw.lower().split(' ')
print([''.join([a for a in w1 if a.isalpha()]) for w1 in words])
sentence = ' '.join([w for w in [''.join([a for a in w1 if a.isalpha()]) for w1 in words] if w not in [sw.lower() for sw in stopwords]])
sentenses.append(sentence)
+ sentenses_raw.append(sentence_raw)
result = []
- result.append(['id', 'text', 'count'])
- for index, sentence in enumerate(sentenses):
- result.append([index, sentence, 1000])
- with open('data/' + randId + '.tsv', 'w', newline='', encoding="utf-8") as f:
- writer = csv.writer(f, delimiter='\t')
- writer.writerows(result)
- f.close()
- return ('data/' + randId + '.tsv')
+ for index, raw_pair in enumerate(zip(sentenses, sentenses_raw)):
+ sentence, sentence_raw = raw_pair
+ result.append({
+ 'id': index,
+ 'text': sentence,
+ 'count': 10,
+ 'rawtxt': sentence_raw,
+ })
+
+ with open('data/' + randId + '.json', 'w', newline='', encoding="utf-8") as fp:
+ json.dump(result, fp, ensure_ascii=False, indent=4)
+ return ('data/' + randId + '.json')
diff --git a/static/js/generalText.js b/static/js/generalText.js
index 6d6de75..2ccc48e 100644
--- a/static/js/generalText.js
+++ b/static/js/generalText.js
@@ -1,6 +1,15 @@
var tsvPath
var stopwords = []
+const init = () => {
+ $(window).on('mousemove', function (e) {
+ $('#nodeTitle').css({
+ left: e.pageX,
+ top: e.pageY
+ })
+ })
+}
+
function clearStopWord() {
stopwords = []
$('#sweContainer').html('')
@@ -86,22 +95,34 @@ function submit() {
function buildSentetree() {
console.log("Build.")
- var model;
- var tree;
- var data;
- const graph = d3.tsv(tsvPath, buildTree);
+ let model;
+ let tree;
+ let data;
+ const graph = d3.json(tsvPath, buildTree);
function buildTree(error, rawdata) {
+ console.log(rawdata)
const data = rawdata.map(d => Object.assign({}, d, { count: +d.count }));
+ console.log({ data })
+ let minRatio = $('#minRatio').val()
+ let maxRatio = $('#maxRatio').val()
+ console.log({ minRatio, maxRatio })
model = new SentenTree.SentenTreeBuilder()
.tokenize(SentenTree.tokenizer.tokenizeBySpace)
.transformToken(token => (/score(d|s)?/.test(token) ? 'score' : token))
- .buildModel(data, { maxSupportRatio: 1 });
+ .buildModel(data, { maxSupportRatio: maxRatio, minSupportRatio: minRatio });
tree = new SentenTree.SentenTreeVis('#vis', {
fontSize: [15, 40],
gapBetweenGraph: 10
});
tree.data(model.getRenderedGraphs(2))
+ .on('nodeMouseenter', (node) => {
+ $('#nodeTitle').removeClass('hidden')
+ $('#nodeTitleContent').html(node.data.topEntries.map((n) => data[n.id].rawtxt).join('<br>'))
+ })
+ .on('nodeMouseleave', () => {
+ $('#nodeTitle').addClass('hidden')
+ })
new ResizeSensor(jQuery('#d3kitRoot'), function () {
var scale, origin;
scale = Math.min(2, ($('#graph').outerWidth()) / ($('#d3kitRoot').outerWidth() + 60))
@@ -132,4 +153,6 @@ function countWords() {
let wordCount = text.split(new RegExp("[\u4e00-\u9fa5]")).length - 1
console.log(wordCount)
$("#wordcount").html('字數:' + wordCount)
-}
\ No newline at end of file
+}
+
+init()
\ No newline at end of file
diff --git a/static/js/ptt.js b/static/js/ptt.js
index 235a60d..1ef541c 100755
--- a/static/js/ptt.js
+++ b/static/js/ptt.js
@@ -17,7 +17,7 @@ function init() {
type: 'POST',
url: '/init',
dataType: 'json',
- success: function(data) {
+ success: function (data) {
console.log(data)
setDate(data.Result.startDate, data.Result.endDate)
document.getElementById('keywordBox').value = data.Result.keyword
@@ -39,44 +39,44 @@ function init() {
buildSentetree()
}
})
- $(document).ready(function() {
- $(window).keydown(function(event) {
+ $(document).ready(function () {
+ $(window).keydown(function (event) {
if (event.keyCode == 13) {
event.preventDefault()
sendRequest()
}
});
});
- $(window).on('mousemove', function(e) {
+ $(window).on('mousemove', function (e) {
$('#nodeTitle').css({
left: e.pageX,
top: e.pageY
})
})
$('#titleListContainer').hover(
- function() { // Run on hover/mouseenter
+ function () { // Run on hover/mouseenter
$(this).css('overflow', 'auto')
},
- function() { // Run on mouseleave
+ function () { // Run on mouseleave
$(this).css('overflow', 'hidden')
}
)
- $('#titleListLayer').click(function(e) {
+ $('#titleListLayer').click(function (e) {
if ($('#titleListLayer').is(e.target)) {
hideTitles()
}
})
- $('#stopWordEditorLayer').click(function(e) {
+ $('#stopWordEditorLayer').click(function (e) {
if ($('#stopWordEditorLayer').is(e.target)) {
hideStopWordEditor()
}
})
- $('#idfEditorLayer').click(function(e) {
+ $('#idfEditorLayer').click(function (e) {
if ($('#idfEditorLayer').is(e.target)) {
hideIdfEditor()
}
})
- $('#pttPageWindow').click(function(e) {
+ $('#pttPageWindow').click(function (e) {
if ($('#pttPageWindow').is(e.target)) {
hidePTTPage()
}
@@ -88,23 +88,23 @@ function init() {
function loadTemplate(num) {
templates = [{
- startDate: '2020-12-01',
- endDate: '2020-12-31',
- keyword: '',
- mode: 1
- },
- {
- startDate: '2020-01-01',
- endDate: '2020-03-01',
- keyword: '衛生紙',
- mode: 2
- },
- {
- startDate: '2020-01-11',
- endDate: '2020-01-12',
- keyword: '',
- mode: 2
- }
+ startDate: '2020-12-01',
+ endDate: '2020-12-31',
+ keyword: '',
+ mode: 1
+ },
+ {
+ startDate: '2020-01-01',
+ endDate: '2020-03-01',
+ keyword: '衛生紙',
+ mode: 2
+ },
+ {
+ startDate: '2020-01-11',
+ endDate: '2020-01-12',
+ keyword: '',
+ mode: 2
+ }
]
chosenTemp = templates[num]
setDate(chosenTemp.startDate, chosenTemp.endDate)
@@ -127,7 +127,7 @@ function addStopWord() {
} else {
stopwords.push(newsw)
- $('#sweContainer').append($('