修改停用詞的處理方法

dev/addPush
Zovjsra 4 years ago
parent 8014aef8d1
commit ed2e045958

@ -124,7 +124,7 @@ def filterPOS(content, aid):
stopwords = content['stopwords'] stopwords = content['stopwords']
else: else:
stopwords = defaultStopWords stopwords = defaultStopWords
stopped = [i for i in cutted if i not in stopwords] stopped = [i for i in cutted] # 不在server端刪除停用詞
return stopped return stopped
@ -217,7 +217,7 @@ def findResult(content):
filtered.append(i) filtered.append(i)
titles['info']['posts'] = len(filtered) titles['info']['posts'] = len(filtered)
filtered = [i for i in sorted( filtered = [i for i in sorted(
filtered, key=lambda x: x['pushes'], reverse=True)[:50]] filtered, key=lambda x: x['pushes'], reverse=True)[:(30 if (content['keyword'] == "") else 100)]]
print('到第一步為止生成花費', int(time()-timeStart), '') print('到第一步為止生成花費', int(time()-timeStart), '')
counter = 0 counter = 0
total = len(filtered) total = len(filtered)
@ -299,7 +299,7 @@ def getDefault(startDate, endDate):
filtered.append(i) filtered.append(i)
titles['info']['posts'] = len(filtered) titles['info']['posts'] = len(filtered)
filtered = [i for i in sorted( filtered = [i for i in sorted(
filtered, key=lambda x: x['pushes'], reverse=True)[:50]] filtered, key=lambda x: x['pushes'], reverse=True)[:30]]
counter = 0 counter = 0
total = len(postContents) total = len(postContents)
content = { content = {

@ -40,7 +40,7 @@ def possegPushes(ids: list, stopwords: list, keyword: str):
result = [] result = []
for index, p in enumerate(possegs): for index, p in enumerate(possegs):
words = [i[1] for i in p['content'] if i[0] not in [ words = [i[1] for i in p['content'] if i[0] not in [
'eng', 'x', 'm'] and i[1] not in stopwords] 'eng', 'x', 'm']]
if(keyword == '' or keyword in words): if(keyword == '' or keyword in words):
result.append({ result.append({
'posString': ' '.join(words), 'posString': ' '.join(words),

@ -18,4 +18,53 @@
完整 完整
新聞 新聞
標題 標題
內文 內文
可以
沒有
就是
自己
大家
我們
知道
網址
備註
連結
所以

@ -10,6 +10,7 @@ var wordTitleList
var randId var randId
var globKeyword = '' var globKeyword = ''
var stopwords = [] var stopwords = []
var tsvString
function init() { function init() {
$.ajax({ $.ajax({
@ -35,7 +36,7 @@ function init() {
$('#graphInfo').empty() $('#graphInfo').empty()
$('#graphInfo').attr('style', 'margin: 10px;').append('總文章數:' + json.info.posts + ',' + keywordCountString) $('#graphInfo').attr('style', 'margin: 10px;').append('總文章數:' + json.info.posts + ',' + keywordCountString)
totalPosts = json.info.posts totalPosts = json.info.posts
buildSentetree(tsvString) buildSentetree()
} }
}) })
$(document).ready(function() { $(document).ready(function() {
@ -102,7 +103,6 @@ function addStopWord() {
} }
function showStopwordEditor() { function showStopwordEditor() {
console.log(stopwords)
$(window).unbind('keydown') $(window).unbind('keydown')
$(window).keydown(function(event) { $(window).keydown(function(event) {
if (event.keyCode == 13) { if (event.keyCode == 13) {
@ -175,7 +175,7 @@ function sendRequest() {
startDate: $('#startDate').val(), startDate: $('#startDate').val(),
endDate: $('#endDate').val(), endDate: $('#endDate').val(),
keyword: $('#keywordBox').val(), keyword: $('#keywordBox').val(),
stopwords: stopwords, stopwords: [],
pos: { pos: {
noun: $('#noun').is(':checked'), noun: $('#noun').is(':checked'),
verb: $('#verb').is(':checked'), verb: $('#verb').is(':checked'),
@ -216,19 +216,21 @@ function changeGraph(data) {
$('#graphInfo').attr('style', 'margin: 10px;').append('總文章數:' + json.info.posts + keywordCountString) $('#graphInfo').attr('style', 'margin: 10px;').append('總文章數:' + json.info.posts + keywordCountString)
totalPosts = json.info.posts totalPosts = json.info.posts
destroyCurrentGraph() destroyCurrentGraph()
d3.select('#graph').append('div').attr('id', 'vis') buildSentetree()
buildSentetree(tsvString)
} }
function destroyCurrentGraph() { function destroyCurrentGraph() {
d3.selectAll('#vis').remove() d3.selectAll('#vis').remove()
d3.select('#graph').append('div').attr('id', 'vis')
} }
function hideTitles() { function hideTitles() {
$('#titleListLayer').addClass('hidden') $('#titleListLayer').addClass('hidden')
$('#setToKeyword').unbind()
$("#addToStopwords").unbind()
} }
function buildSentetree(tsvString) { function buildSentetree() {
console.log("Build.") console.log("Build.")
var model; var model;
var tree; var tree;
@ -253,10 +255,9 @@ function buildSentetree(tsvString) {
fontSize: [15, 40], fontSize: [15, 40],
gapBetweenGraph: 10 gapBetweenGraph: 10
}); });
console.log(tree)
tree.data(model.getRenderedGraphs(2)) tree.data(model.getRenderedGraphs(2))
.on('nodeClick', node => { .on('nodeClick', node => {
$("#keywordBox").val(node.data.entity)
$('#titleListLayer').removeClass('hidden')
seqList = node.data.seq.DBs.map(function(n) { seqList = node.data.seq.DBs.map(function(n) {
return n.rawText return n.rawText
}) })
@ -273,6 +274,27 @@ function buildSentetree(tsvString) {
info = wordTitleList[node.data.entity] info = wordTitleList[node.data.entity]
$('#titleListKeyword').html(node.data.entity) $('#titleListKeyword').html(node.data.entity)
$('#titleListKeywordInfo').html('') $('#titleListKeywordInfo').html('')
// Configure the stop-word toggle button for the clicked node:
// - if the word is not yet a stop word, the button adds it;
// - otherwise the button removes it.
// Either way the graph is rebuilt and the title list overlay is closed.
if (stopwords.indexOf(node.data.entity) < 0) {
    $("#addToStopwords").html('設為停用詞').css('background-color', '#379').click(() => {
        stopwords.push(node.data.entity)
        destroyCurrentGraph()
        buildSentetree()
        hideTitles()
    })
} else {
    $("#addToStopwords").html('從停用詞移除').css('background-color', '#933').click(() => {
        // Array.prototype.pop() ignores its argument and always drops the LAST
        // element, so the clicked word must be located and removed by index.
        const stopwordIndex = stopwords.indexOf(node.data.entity)
        if (stopwordIndex >= 0) {
            stopwords.splice(stopwordIndex, 1)
        }
        destroyCurrentGraph()
        buildSentetree()
        hideTitles()
    })
}
$('#setToKeyword').click(() => {
$('#keywordBox').val(node.data.entity)
sendRequest()
hideTitles()
})
$('#titleListLayer').removeClass('hidden')
$.ajax({ $.ajax({
type: 'POST', type: 'POST',
url: '/ptt/keywordFrequency', url: '/ptt/keywordFrequency',

File diff suppressed because it is too large Load Diff

@ -19143,7 +19143,7 @@
words.forEach(function(w) { words.forEach(function(w) {
var value = fdist[w]; var value = fdist[w];
if (value < maxSupport && value > maxc) { if (value < maxSupport && value > maxc && (isNotRoot || stopwords.indexOf(itemset[w]) < 0)) {
maxw = +w; maxw = +w;
maxc = value; maxc = value;
} }
@ -19185,7 +19185,7 @@
} }
} }
return { word: word, pos: pos, count: count, s0: s0, s1: s1 }; return { word: word, pos: pos, count: stopwords.indexOf(itemset[word]) < 0 ? count : minSupport, s0: s0, s1: s1 };
} }
function expandSeqTree(rootSeq, graphs, expandCnt, minSupport, maxSupport, terms, itemset) { function expandSeqTree(rootSeq, graphs, expandCnt, minSupport, maxSupport, terms, itemset) {
@ -19344,6 +19344,7 @@
var graphs = []; var graphs = [];
var visibleGroups = expandSeqTree(this.rootSeq, graphs, DEFAULT_NODE_COUNT, minSupport, maxSupport, this.terms, itemset); var visibleGroups = expandSeqTree(this.rootSeq, graphs, DEFAULT_NODE_COUNT, minSupport, maxSupport, this.terms, itemset);
this.graphs = graphs.filter(function(g) { this.graphs = graphs.filter(function(g) {
return g.nodes.length > 2; return g.nodes.length > 2;
}).slice(0, 10); }).slice(0, 10);
@ -19376,9 +19377,11 @@
key: 'getRenderedGraphs', key: 'getRenderedGraphs',
value: function getRenderedGraphs(limit) { value: function getRenderedGraphs(limit) {
var graphs = arguments.length === 1 ? this.graphs.slice(0, limit) : this.graphs; var graphs = arguments.length === 1 ? this.graphs.slice(0, limit) : this.graphs;
console.log("slice")
var renderedGraphs = graphs.map(function(g) { var renderedGraphs = graphs.map(function(g) {
return g.toRenderedGraph(); return g.toRenderedGraph();
}); });
console.log("toRenderedGraph")
var globalFreqRange = [(0, _lodash.min)(renderedGraphs.map(function(g) { var globalFreqRange = [(0, _lodash.min)(renderedGraphs.map(function(g) {
return g.freqRange[0]; return g.freqRange[0];
})), (0, _lodash.max)(renderedGraphs.map(function(g) { })), (0, _lodash.max)(renderedGraphs.map(function(g) {
@ -37424,7 +37427,6 @@
}).filter(function(entry) { }).filter(function(entry) {
return entry.tokens.length > 0; return entry.tokens.length > 0;
}); });
return new _TokenizedDataset2.default(tokenizedEntries); return new _TokenizedDataset2.default(tokenizedEntries);
} }
}, { }, {
@ -37855,13 +37857,18 @@
heap.push(n); heap.push(n);
}); });
let counter = 1;
while (heap.size() > 0) { while (heap.size() > 0) {
console.log(`in while ${counter++}`)
var parent = heap.pop(); var parent = heap.pop();
console.log(heap)
if (parent.merged) { if (parent.merged) {
continue; continue;
} }
var groups = []; var groups = [];
console.log(parent.data.id)
if (parent.leftLinks.length > 1) { if (parent.leftLinks.length > 1) {
var lNodes = parent.leftLinks.map(function(l) { var lNodes = parent.leftLinks.map(function(l) {
return l.source; return l.source;
@ -37875,6 +37882,7 @@
}); });
groups = groups.concat(this.groupMergeableNodes(rNodes)); groups = groups.concat(this.groupMergeableNodes(rNodes));
} }
console.log(groups)
if (groups.length > 0) { if (groups.length > 0) {
var newNodes = groups.map(function(group) { var newNodes = groups.map(function(group) {
@ -38102,6 +38110,7 @@
var RenderedGraph = function() { var RenderedGraph = function() {
function RenderedGraph(rawGraph) { function RenderedGraph(rawGraph) {
console.log(arguments)
var _ref = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : {}, var _ref = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : {},
_ref$bundle = _ref.bundle, _ref$bundle = _ref.bundle,
bundle = _ref$bundle === undefined ? true : _ref$bundle, bundle = _ref$bundle === undefined ? true : _ref$bundle,
@ -38134,8 +38143,10 @@
this.assignNodeIds(nodes); this.assignNodeIds(nodes);
console.log(bundle)
if (bundle) { if (bundle) {
var bundled = new _GraphBundler2.default(nodes, links).bundle(); var bundled = new _GraphBundler2.default(nodes, links).bundle();
console.log(bundled)
this.nodes = bundled.nodes; this.nodes = bundled.nodes;
this.links = bundled.links; this.links = bundled.links;
this.assignNodeIds(this.nodes); this.assignNodeIds(this.nodes);

@ -1,110 +1,110 @@
{ {
"_from": "sententree", "_from": "sententree",
"_id": "sententree@1.0.0", "_id": "sententree@1.0.0",
"_inBundle": false, "_inBundle": false,
"_integrity": "sha1-xIKf0Tmu5dgroRPaDRqjtR2vWas=", "_integrity": "sha1-xIKf0Tmu5dgroRPaDRqjtR2vWas=",
"_location": "/sententree", "_location": "/sententree",
"_phantomChildren": {}, "_phantomChildren": {},
"_requested": { "_requested": {
"type": "tag", "type": "tag",
"registry": true, "registry": true,
"raw": "sententree", "raw": "sententree",
"name": "sententree",
"escapedName": "sententree",
"rawSpec": "",
"saveSpec": null,
"fetchSpec": "latest"
},
"_requiredBy": [
"#USER",
"/"
],
"_resolved": "https://registry.npmjs.org/sententree/-/sententree-1.0.0.tgz",
"_shasum": "c4829fd139aee5d82ba113da0d1aa3b51daf59ab",
"_spec": "sententree",
"_where": "C:\\Users\\a5640\\OneDrive - National ChengChi University\\programming\\Python\\flask\\sententree\\static",
"author": "",
"bugs": {
"url": "https://github.com/twitter/sententree/issues"
},
"bundleDependencies": false,
"dependencies": {
"d3": "^4.4.1",
"d3kit": "^3.2.0",
"heap": "^0.2.6",
"lodash": "^4.17.4",
"webcola": "^3.3.0"
},
"deprecated": false,
"description": "A novel text visualization technique",
"devDependencies": {
"babel-core": "^6.22.1",
"babel-eslint": "^6.1.2",
"babel-loader": "^6.2.10",
"babel-plugin-external-helpers": "^6.8.0",
"babel-plugin-istanbul": "^2.0.1",
"babel-plugin-transform-object-assign": "^6.22.0",
"babel-preset-es2015": "^6.3.13",
"babel-preset-react": "^6.22.0",
"browser-sync": "~2.14.0",
"chai": "^3.5.0",
"eslint": "^2.9.0",
"eslint-config-airbnb": "^9.0.1",
"eslint-plugin-import": "^1.12.0",
"eslint-plugin-jsx-a11y": "^1.2.0",
"eslint-plugin-mocha": "^4.7.0",
"eslint-plugin-react": "^5.0.1",
"gh-pages": "^0.11.0",
"gulp": "^3.9.1",
"gulp-sass": "^3.1.0",
"karma": "~0.13.15",
"karma-chai": "^0.1.0",
"karma-coverage": "~0.5.3",
"karma-mocha": "^1.1.1",
"karma-mocha-reporter": "^2.1.0",
"karma-phantomjs-launcher": "^1.0.1",
"karma-webpack": "^2.0.2",
"mocha": "^3.0.2",
"node-sass": "^4.5.0",
"pkgfiles": "^2.3.0",
"react": "^15.4.2",
"react-d3kit": "^1.2.4",
"react-dom": "^15.4.2",
"uglifyjs": "^2.4.10",
"webpack": "^2.2.1"
},
"files": [
"src/**/*.*",
"dist/*.*"
],
"homepage": "https://github.com/twitter/sententree#readme",
"keywords": [
"sententree",
"visualization"
],
"license": "Apache-2.0",
"main": "dist/SentenTree.js",
"name": "sententree", "name": "sententree",
"escapedName": "sententree", "repository": {
"rawSpec": "", "type": "git",
"saveSpec": null, "url": "git+https://github.com/twitter/sententree.git"
"fetchSpec": "latest" },
}, "scripts": {
"_requiredBy": [ "build": "npm run build-lib && npm run build-standalone",
"#USER", "build-demo": "NODE_ENV=production webpack --config webpack.config.demo.js && gulp sass",
"/" "build-lib": "NODE_ENV=production webpack && uglifyjs dist/sententree.js -m -c > dist/sententree.min.js",
], "build-standalone": "NODE_ENV=production webpack --config webpack.config.standalone.js && uglifyjs dist/sententree-standalone.js -m -c > dist/sententree-standalone.min.js",
"_resolved": "https://registry.npmjs.org/sententree/-/sententree-1.0.0.tgz", "clean": "rm -rf dist",
"_shasum": "c4829fd139aee5d82ba113da0d1aa3b51daf59ab", "eslint": "eslint --ignore-path .gitignore \"src/**/*.@(js|jsx)\"",
"_spec": "sententree", "eslint-fix": "eslint --fix --ignore-path .gitignore \"src/**/*.@(js|jsx)\"",
"_where": "C:\\Users\\a5640\\OneDrive - National ChengChi University\\programming\\Python\\flask\\sententree\\static", "gh-pages": "npm run build-demo && gh-pages -d demo",
"author": "", "postversion": "git push ; git push --tags; pkgfiles",
"bugs": { "prepublish": "pkgfiles",
"url": "https://github.com/twitter/sententree/issues" "server": "gulp server",
}, "start": "webpack --watch --config webpack.config.demo.js & npm run server",
"bundleDependencies": false, "tdd": "NODE_ENV=test karma start",
"dependencies": { "test": "NODE_ENV=test karma start --single-run",
"d3": "^4.4.1", "version": "npm run build && git add -A dist"
"d3kit": "^3.2.0", },
"heap": "^0.2.6", "version": "1.0.0"
"lodash": "^4.17.4", }
"webcola": "^3.3.0"
},
"deprecated": false,
"description": "A novel text visualization technique",
"devDependencies": {
"babel-core": "^6.22.1",
"babel-eslint": "^6.1.2",
"babel-loader": "^6.2.10",
"babel-plugin-external-helpers": "^6.8.0",
"babel-plugin-istanbul": "^2.0.1",
"babel-plugin-transform-object-assign": "^6.22.0",
"babel-preset-es2015": "^6.3.13",
"babel-preset-react": "^6.22.0",
"browser-sync": "~2.14.0",
"chai": "^3.5.0",
"eslint": "^2.9.0",
"eslint-config-airbnb": "^9.0.1",
"eslint-plugin-import": "^1.12.0",
"eslint-plugin-jsx-a11y": "^1.2.0",
"eslint-plugin-mocha": "^4.7.0",
"eslint-plugin-react": "^5.0.1",
"gh-pages": "^0.11.0",
"gulp": "^3.9.1",
"gulp-sass": "^3.1.0",
"karma": "~0.13.15",
"karma-chai": "^0.1.0",
"karma-coverage": "~0.5.3",
"karma-mocha": "^1.1.1",
"karma-mocha-reporter": "^2.1.0",
"karma-phantomjs-launcher": "^1.0.1",
"karma-webpack": "^2.0.2",
"mocha": "^3.0.2",
"node-sass": "^4.5.0",
"pkgfiles": "^2.3.0",
"react": "^15.4.2",
"react-d3kit": "^1.2.4",
"react-dom": "^15.4.2",
"uglifyjs": "^2.4.10",
"webpack": "^2.2.1"
},
"files": [
"src/**/*.*",
"dist/*.*"
],
"homepage": "https://github.com/twitter/sententree#readme",
"keywords": [
"sententree",
"visualization"
],
"license": "Apache-2.0",
"main": "dist/SentenTree.min.js",
"name": "sententree",
"repository": {
"type": "git",
"url": "git+https://github.com/twitter/sententree.git"
},
"scripts": {
"build": "npm run build-lib && npm run build-standalone",
"build-demo": "NODE_ENV=production webpack --config webpack.config.demo.js && gulp sass",
"build-lib": "NODE_ENV=production webpack && uglifyjs dist/sententree.js -m -c > dist/sententree.min.js",
"build-standalone": "NODE_ENV=production webpack --config webpack.config.standalone.js && uglifyjs dist/sententree-standalone.js -m -c > dist/sententree-standalone.min.js",
"clean": "rm -rf dist",
"eslint": "eslint --ignore-path .gitignore \"src/**/*.@(js|jsx)\"",
"eslint-fix": "eslint --fix --ignore-path .gitignore \"src/**/*.@(js|jsx)\"",
"gh-pages": "npm run build-demo && gh-pages -d demo",
"postversion": "git push ; git push --tags; pkgfiles",
"prepublish": "pkgfiles",
"server": "gulp server",
"start": "webpack --watch --config webpack.config.demo.js & npm run server",
"tdd": "NODE_ENV=test karma start",
"test": "NODE_ENV=test karma start --single-run",
"version": "npm run build && git add -A dist"
},
"version": "1.0.0"
}

@ -5,141 +5,141 @@ import Link from './Link.js';
import Node from './Node.js'; import Node from './Node.js';
export default class RenderedGraph { export default class RenderedGraph {
constructor(rawGraph, { constructor(rawGraph, {
bundle = true, bundle = true,
highFrequencyOnTop = true, highFrequencyOnTop = true,
} = {}) { } = {}) {
this.options = { bundle, highFrequencyOnTop }; this.options = { bundle, highFrequencyOnTop };
this.minSupport = rawGraph.minSupport; this.minSupport = rawGraph.minSupport;
this.maxSupport = rawGraph.maxSupport; this.maxSupport = rawGraph.maxSupport;
const nodes = rawGraph.nodes.map(n => new Node(n)); const nodes = rawGraph.nodes.map(n => new Node(n));
const links = []; const links = [];
Object.keys(rawGraph.linkadj).forEach(l => { Object.keys(rawGraph.linkadj).forEach(l => {
const leftNode = nodes[l]; const leftNode = nodes[l];
const rights = rawGraph.linkadj[l]; const rights = rawGraph.linkadj[l];
Object.keys(rights).forEach(r => { Object.keys(rights).forEach(r => {
const rightNode = nodes[r]; const rightNode = nodes[r];
const link = new Link( const link = new Link(
leftNode, leftNode,
rightNode, rightNode,
rights[r] rights[r]
); );
links.push(link); links.push(link);
leftNode.rightLinks.push(link); leftNode.rightLinks.push(link);
rightNode.leftLinks.push(link); rightNode.leftLinks.push(link);
}); });
}); });
this.assignNodeIds(nodes); this.assignNodeIds(nodes);
if (bundle) { if (bundle) {
const bundled = new GraphBundler(nodes, links).bundle(); const bundled = new GraphBundler(nodes, links).bundle();
this.nodes = bundled.nodes; this.nodes = bundled.nodes;
this.links = bundled.links; this.links = bundled.links;
this.assignNodeIds(this.nodes); this.assignNodeIds(this.nodes);
} else { } else {
this.nodes = nodes; this.nodes = nodes;
this.links = links; this.links = links;
}
this.linkLookup = keyBy(this.links, l => l.getKey());
if (highFrequencyOnTop) {
this.nodes.forEach(n => {
n.rightLinks.sort((a, b) => b.freq - a.freq);
n.leftLinks.sort((a, b) => b.freq - a.freq);
});
}
const frequencies = this.nodes.map(n => n.data.freq);
this.freqRange = [min(frequencies), max(frequencies)];
this.globalFreqRange = this.freqRange;
const onlyBridgeConstraints = this.links
.filter(link => link.isTheOnlyBridge())
.map(link => link.toOnlyBridgeConstraint());
this.baseConstraints = onlyBridgeConstraints
.concat(this.getAlignmentConstraints());
} }
this.linkLookup = keyBy(this.links, l => l.getKey());
if (highFrequencyOnTop) { updateNodeSize(sizeFn) {
this.nodes.forEach(n => { this.nodes.forEach(node => {
n.rightLinks.sort((a, b) => b.freq - a.freq); const { width, height } = sizeFn(node);
n.leftLinks.sort((a, b) => b.freq - a.freq); node.width = width;
}); node.height = height;
});
return this;
} }
const frequencies = this.nodes.map(n => n.data.freq); assignNodeIds(nodes, startIndex = 0) {
this.freqRange = [min(frequencies), max(frequencies)]; nodes.forEach((n, i) => { n.id = i + startIndex; });
this.globalFreqRange = this.freqRange; return this;
const onlyBridgeConstraints = this.links
.filter(link => link.isTheOnlyBridge())
.map(link => link.toOnlyBridgeConstraint());
this.baseConstraints = onlyBridgeConstraints
.concat(this.getAlignmentConstraints());
}
updateNodeSize(sizeFn) {
this.nodes.forEach(node => {
const { width, height } = sizeFn(node);
node.width = width;
node.height = height;
});
return this;
}
assignNodeIds(nodes, startIndex = 0) {
nodes.forEach((n, i) => { n.id = i + startIndex; });
return this;
}
getAlignmentConstraints() {
const alignmentConstraints = [];
if (this.nodes.length > 0) {
const visitedNodes = this.nodes.map(() => false);
let queue = [this.nodes[0]];
while (queue.length > 0) {
const node = queue.shift();
const nodeIndex = node.id;
if (visitedNodes[nodeIndex]) continue;
visitedNodes[nodeIndex] = true;
const constraints = node.computeRightConstraints();
if (constraints) {
alignmentConstraints.push(constraints);
}
const rNodes = node.getRightNodes();
if (rNodes.length > 0) {
queue = queue.concat(rNodes);
}
}
for (let i = 0; i < this.nodes.length; i++) {
visitedNodes[i] = false;
}
queue = [this.nodes[0]];
while (queue.length > 0) {
const node = queue.shift();
const nodeIndex = node.id;
if (visitedNodes[nodeIndex]) continue;
visitedNodes[nodeIndex] = true;
const constraints = node.computeLeftConstraints();
if (constraints) {
alignmentConstraints.push(constraints);
}
const lNodes = node.getLeftNodes();
if (lNodes.length > 0) {
queue = queue.concat(lNodes);
}
}
} }
return alignmentConstraints; getAlignmentConstraints() {
} const alignmentConstraints = [];
if (this.nodes.length > 0) {
const visitedNodes = this.nodes.map(() => false);
let queue = [this.nodes[0]];
while (queue.length > 0) {
const node = queue.shift();
const nodeIndex = node.id;
if (visitedNodes[nodeIndex]) continue;
visitedNodes[nodeIndex] = true;
const constraints = node.computeRightConstraints();
if (constraints) {
alignmentConstraints.push(constraints);
}
const rNodes = node.getRightNodes();
if (rNodes.length > 0) {
queue = queue.concat(rNodes);
}
}
for (let i = 0; i < this.nodes.length; i++) {
visitedNodes[i] = false;
}
queue = [this.nodes[0]];
while (queue.length > 0) {
const node = queue.shift();
const nodeIndex = node.id;
if (visitedNodes[nodeIndex]) continue;
visitedNodes[nodeIndex] = true;
const constraints = node.computeLeftConstraints();
if (constraints) {
alignmentConstraints.push(constraints);
}
const lNodes = node.getLeftNodes();
if (lNodes.length > 0) {
queue = queue.concat(lNodes);
}
}
}
return alignmentConstraints;
}
getLinkConstraints() { getLinkConstraints() {
return this.links.map(l => l.toConstraint()); return this.links.map(l => l.toConstraint());
} }
getConstraints() { getConstraints() {
const constraints = this.baseConstraints const constraints = this.baseConstraints
.concat(this.links.map(l => l.toConstraint())); .concat(this.links.map(l => l.toConstraint()));
return this.options.highFrequencyOnTop return this.options.highFrequencyOnTop ?
? constraints.concat(flatMap(this.nodes, n => n.computeOrderConstraints())) constraints.concat(flatMap(this.nodes, n => n.computeOrderConstraints())) :
: constraints; constraints;
} }
toGroupConstraint() { toGroupConstraint() {
return { return {
leaves: this.nodes.map(n => n.id), leaves: this.nodes.map(n => n.id),
}; };
} }
} }

@ -6,278 +6,278 @@ import RawGraph from './RawGraph.js';
const DEFAULT_NODE_COUNT = 150; const DEFAULT_NODE_COUNT = 150;
function growSeq(seq, terms, minSupport, maxSupport, itemset) { function growSeq(seq, terms, minSupport, maxSupport, itemset) {
/* find the next frequent sequence by inserting a new word to current sequence */ /* find the next frequent sequence by inserting a new word to current sequence */
let pos = -1; let pos = -1;
let word = null; let word = null;
let count = 0; let count = 0;
const len = seq.words.length; const len = seq.words.length;
for (let s = 0; s <= len; s++) { for (let s = 0; s <= len; s++) {
const fdist = {}; const fdist = {};
seq.DBs.forEach(t => { seq.DBs.forEach(t => {
const l = s === 0 ? 0 : t.seqIndices[s - 1] + 1; const l = s === 0 ? 0 : t.seqIndices[s - 1] + 1;
const r = s === len ? t.tokens.length : t.seqIndices[s]; const r = s === len ? t.tokens.length : t.seqIndices[s];
const duplicate = {}; const duplicate = {};
for (let i = l; i < r; i++) { for (let i = l; i < r; i++) {
const w = t.tokens[i]; const w = t.tokens[i];
if (duplicate[w]) continue; if (duplicate[w]) continue;
duplicate[w] = true; duplicate[w] = true;
if (w in fdist) { if (w in fdist) {
fdist[w] += t.count; fdist[w] += t.count;
} else { } else {
fdist[w] = t.count; fdist[w] = t.count;
}
}
});
let maxw = null;
let maxc = 0;
const isNotRoot = len > 0;
const words = isNotRoot ?
Object.keys(fdist) :
Object.keys(fdist).filter(w => !itemset[w].startsWith('#'));
words.forEach(w => {
const value = fdist[w];
if (value < maxSupport && value > maxc) {
maxw = +w;
maxc = value;
}
});
if (maxc > count) {
pos = s;
word = maxw;
count = maxc;
} }
}
});
let maxw = null;
let maxc = 0;
const isNotRoot = len > 0;
const words = isNotRoot
? Object.keys(fdist)
: Object.keys(fdist).filter(w => !itemset[w].startsWith('#'));
words.forEach(w => {
const value = fdist[w];
if (value < maxSupport && value > maxc) {
maxw = +w;
maxc = value;
}
});
if (maxc > count) {
pos = s;
word = maxw;
count = maxc;
} }
}
let s0 = null;
let s0 = null; let s1 = null;
let s1 = null;
/* split the current group in two */
/* split the current group in two */ if (count >= minSupport) {
if (count >= minSupport) { s0 = { size: 0, DBs: [] };
s0 = { size: 0, DBs: [] }; s1 = { size: 0, DBs: [] };
s1 = { size: 0, DBs: [] }; const words = seq.words;
const words = seq.words; for (let ti = 0; ti < seq.DBs.length; ti++) {
for (let ti = 0; ti < seq.DBs.length; ti ++) { const t = seq.DBs[ti];
const t = seq.DBs[ti]; const l = pos === 0 ? 0 : t.seqIndices[pos - 1] + 1;
const l = pos === 0 ? 0 : t.seqIndices[pos - 1] + 1; const r = pos === words.length ? t.tokens.length : t.seqIndices[pos];
const r = pos === words.length ? t.tokens.length : t.seqIndices[pos]; let i = t.tokens.slice(l, r).indexOf(word);
let i = t.tokens.slice(l, r).indexOf(word); if (i < 0) {
if (i < 0) { s0.DBs.push(t);
s0.DBs.push(t); s0.size += t.count;
s0.size += t.count; } else {
} else { i += l;
i += l; t.seqIndices.splice(pos, 0, i);
t.seqIndices.splice(pos, 0, i); s1.DBs.push(t);
s1.DBs.push(t); s1.size += t.count;
s1.size += t.count; }
} }
} }
}
return { word, pos, count, s0, s1 }; return { word, pos, count, s0, s1 };
} }
function expandSeqTree(rootSeq, graphs, expandCnt, minSupport, maxSupport, terms, itemset) { function expandSeqTree(rootSeq, graphs, expandCnt, minSupport, maxSupport, terms, itemset) {
if (rootSeq.words && rootSeq.words.length > 0) { if (rootSeq.words && rootSeq.words.length > 0) {
rootSeq.graph.nodes = rootSeq.graph.nodes.concat(rootSeq.words); rootSeq.graph.nodes = rootSeq.graph.nodes.concat(rootSeq.words);
expandCnt -= rootSeq.words.length; expandCnt -= rootSeq.words.length;
}
/* Create a max heap */
const seqs = new Heap((a, b) => b.size - a.size);
seqs.push(rootSeq);
const leafSeqs = [];
while (!seqs.empty() && expandCnt > 0) {
/* find the candidate sequence with largest support DB */
const s = seqs.pop();
let graph = s.graph;
let s0 = s.r;
let s1 = s.l;
if (!s0 && !s1) {
/* find the next frequent sequence */
const result = growSeq(s, terms, minSupport, maxSupport, itemset);
s0 = result.s0;
s1 = result.s1;
const { word, pos, count } = result;
if (count < minSupport) {
leafSeqs.push(s);
} else {
/* create new sequences and add new word */
if (!graph) {
graph = new RawGraph(minSupport, maxSupport);
graphs.push(graph);
}
const newWord = {
id: graph.totalNodeCnt++,
entity: itemset[word],
freq: count,
topEntries: s1.DBs.slice(0, 5),
seq: s1,
};
const newWords = s.words.slice();
newWords.splice(pos, 0, newWord);
s0.words = s.words;
s1.words = newWords;
s1.newWord = newWord;
s0.graph = s.graph;
s1.graph = graph;
}
} }
if (s1) { /* Create a max heap */
s1.graph.nodes.push(s1.newWord); const seqs = new Heap((a, b) => b.size - a.size);
expandCnt--; seqs.push(rootSeq);
} const leafSeqs = [];
while (!seqs.empty() && expandCnt > 0) {
/* find the candidate sequence with largest support DB */
const s = seqs.pop();
let graph = s.graph;
let s0 = s.r;
let s1 = s.l;
if (!s0 && !s1) {
/* find the next frequent sequence */
const result = growSeq(s, terms, minSupport, maxSupport, itemset);
s0 = result.s0;
s1 = result.s1;
const { word, pos, count } = result;
if (count < minSupport) {
leafSeqs.push(s);
} else {
/* create new sequences and add new word */
if (!graph) {
graph = new RawGraph(minSupport, maxSupport);
graphs.push(graph);
}
const newWord = {
id: graph.totalNodeCnt++,
entity: itemset[word],
freq: count,
topEntries: s1.DBs.slice(0, 5),
seq: s1,
};
const newWords = s.words.slice();
newWords.splice(pos, 0, newWord);
s0.words = s.words;
s1.words = newWords;
s1.newWord = newWord;
s0.graph = s.graph;
s1.graph = graph;
}
}
/* add new sequences to seqTree */ if (s1) {
s.l = s1; s1.graph.nodes.push(s1.newWord);
s.r = s0; expandCnt--;
}
/* add new sequences to candidates */ /* add new sequences to seqTree */
if (s1) { s.l = s1;
seqs.push(s1); s.r = s0;
}
if (s0 && s0.size >= minSupport) { /* add new sequences to candidates */
seqs.push(s0); if (s1) {
seqs.push(s1);
}
if (s0 && s0.size >= minSupport) {
seqs.push(s0);
}
} }
}
return leafSeqs.concat(seqs.toArray()); return leafSeqs.concat(seqs.toArray());
} }
function updateNodesEdges(graphs, leafSeqs) { function updateNodesEdges(graphs, leafSeqs) {
leafSeqs leafSeqs
.filter(seq => graphs.indexOf(seq.graph) >= 0) .filter(seq => graphs.indexOf(seq.graph) >= 0)
.forEach(seq => { .forEach(seq => {
const words = seq.words; const words = seq.words;
const linkadj = seq.graph.linkadj; const linkadj = seq.graph.linkadj;
// printSeq(seq); // printSeq(seq);
for (let i = 0; i < words.length - 1; i++) { for (let i = 0; i < words.length - 1; i++) {
const word = words[i]; const word = words[i];
const id = word.id; const id = word.id;
const nextId = words[i + 1].id; const nextId = words[i + 1].id;
if (!(id in linkadj)) linkadj[id] = {}; if (!(id in linkadj)) linkadj[id] = {};
if (nextId in linkadj[id]) { if (nextId in linkadj[id]) {
linkadj[id][nextId] += seq.size; linkadj[id][nextId] += seq.size;
} else { } else {
linkadj[id][nextId] = seq.size; linkadj[id][nextId] = seq.size;
} }
} }
words words
.filter(word => !word.leafSeq || word.leafSeq < seq.size) .filter(word => !word.leafSeq || word.leafSeq < seq.size)
.forEach(word => { word.leafSeq = seq; }); .forEach(word => { word.leafSeq = seq; });
}); });
} }
function printSeq(words) { function printSeq(words) {
const str = words.map(w => w.entity).join(' '); const str = words.map(w => w.entity).join(' ');
console.log(str); console.log(str);
} }
export default class SentenTreeModel { export default class SentenTreeModel {
constructor(tokenizedData, options = {}) { constructor(tokenizedData, options = {}) {
// extract options // extract options
const { const {
termWeights = {}, termWeights = {},
// minimum support is the max of // minimum support is the max of
// minSupportCount // minSupportCount
// and size * minSupportRatio // and size * minSupportRatio
minSupportCount = 2, minSupportCount = 2,
minSupportRatio = 0.001, minSupportRatio = 0.001,
maxSupportRatio = 0.75, maxSupportRatio = 0.75,
} = options; } = options;
this.options = options; this.options = options;
const { itemset, entries } = tokenizedData; const { itemset, entries } = tokenizedData;
this.tokenizedData = tokenizedData; this.tokenizedData = tokenizedData;
this.terms = tokenizedData.encodeTermWeights(termWeights); this.terms = tokenizedData.encodeTermWeights(termWeights);
const size = tokenizedData.computeSize(); const size = tokenizedData.computeSize();
this.supportRange = [ this.supportRange = [
Math.max(size * minSupportRatio, minSupportCount), Math.max(size * minSupportRatio, minSupportCount),
size * maxSupportRatio, size * maxSupportRatio,
]; ];
const [minSupport, maxSupport] = this.supportRange; const [minSupport, maxSupport] = this.supportRange;
this.rootSeq = { this.rootSeq = {
words: [], words: [],
newWord: null, newWord: null,
graph: null, graph: null,
size, size,
DBs: entries, DBs: entries,
}; };
const graphs = [];
const visibleGroups = expandSeqTree(
this.rootSeq,
graphs,
DEFAULT_NODE_COUNT,
minSupport,
maxSupport,
this.terms,
itemset
);
this.graphs = graphs
.filter(g => g.nodes.length > 2)
.slice(0, 10);
updateNodesEdges(this.graphs, visibleGroups);
}
updateGraphs(newRootSeq) {
this.graphs.forEach(g => g.clear());
const rootSeq = newRootSeq || this.rootSeq;
const [minSupport, maxSupport] = this.supportRange;
const visibleGroups = expandSeqTree(
rootSeq,
this.graphs,
DEFAULT_NODE_COUNT,
minSupport,
maxSupport,
this.terms,
this.tokenizedData.itemset
);
updateNodesEdges(this.graphs, visibleGroups);
return this;
}
size() {
return this.rootSeq.size;
}
getRenderedGraphs(limit) {
const graphs = arguments.length === 1
? this.graphs.slice(0, limit)
: this.graphs;
const renderedGraphs = graphs.map(g => g.toRenderedGraph());
const globalFreqRange = [
min(renderedGraphs.map(g => g.freqRange[0])),
max(renderedGraphs.map(g => g.freqRange[1])),
];
let idPool = 0;
renderedGraphs.forEach(g => {
g.globalFreqRange = globalFreqRange;
g.nodes.forEach(n => {
n.gid = idPool;
idPool++;
});
});
return renderedGraphs;
}
} const graphs = [];
const visibleGroups = expandSeqTree(
this.rootSeq,
graphs,
DEFAULT_NODE_COUNT,
minSupport,
maxSupport,
this.terms,
itemset
);
this.graphs = graphs
.filter(g => g.nodes.length > 2)
.slice(0, 10);
updateNodesEdges(this.graphs, visibleGroups);
}
updateGraphs(newRootSeq) {
this.graphs.forEach(g => g.clear());
const rootSeq = newRootSeq || this.rootSeq;
const [minSupport, maxSupport] = this.supportRange;
const visibleGroups = expandSeqTree(
rootSeq,
this.graphs,
DEFAULT_NODE_COUNT,
minSupport,
maxSupport,
this.terms,
this.tokenizedData.itemset
);
updateNodesEdges(this.graphs, visibleGroups);
return this;
}
size() {
return this.rootSeq.size;
}
getRenderedGraphs(limit) {
const graphs = arguments.length === 1 ?
this.graphs.slice(0, limit) :
this.graphs;
const renderedGraphs = graphs.map(g => g.toRenderedGraph());
const globalFreqRange = [
min(renderedGraphs.map(g => g.freqRange[0])),
max(renderedGraphs.map(g => g.freqRange[1])),
];
let idPool = 0;
renderedGraphs.forEach(g => {
g.globalFreqRange = globalFreqRange;
g.nodes.forEach(n => {
n.gid = idPool;
idPool++;
});
});
return renderedGraphs;
}
}

@ -41,6 +41,8 @@
<ul id="titleListContainer" class="w3-ul w3-hoverable"></ul> <ul id="titleListContainer" class="w3-ul w3-hoverable"></ul>
<div id="backButton" style="margin: 20px 0px;"> <div id="backButton" style="margin: 20px 0px;">
<button class="general-button" type="button" id="confirm" style="background-color: #379; margin: 0px 20px" onclick="hideTitles()">返回</button> <button class="general-button" type="button" id="confirm" style="background-color: #379; margin: 0px 20px" onclick="hideTitles()">返回</button>
<button class="general-button" type="button" id="setToKeyword" style="background-color: #379; margin: 0px 20px">設為關鍵詞</button>
<button class="general-button" type='button' id='addToStopwords' style='background-color: #379;margin: 0px 40px;position: absolute;right: 0px;'></button>
</div> </div>
</div> </div>
</div> </div>

Loading…
Cancel
Save