Modify how stopwords are handled

dev/addPush
Zovjsra 4 years ago
parent 8014aef8d1
commit ed2e045958

@@ -124,7 +124,7 @@ def filterPOS(content, aid):
stopwords = content['stopwords']
else:
stopwords = defaultStopWords
-stopped = [i for i in cutted if i not in stopwords]
+stopped = [i for i in cutted]  # do not remove stopwords on the server side
return stopped
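
With this change the server keeps every token and leaves stopword removal to the browser, so editing the stopword list no longer costs a round trip. A minimal sketch of the client-side equivalent, assuming the global stopwords array maintained by the stopword editor in the frontend (the helper name filterStopwords is hypothetical):

    // Drop stopwords on the client, mirroring the filter the server used to apply.
    // `stopwords` is assumed to be the user-editable global array.
    function filterStopwords(tokens, stopwords) {
        const lookup = new Set(stopwords)           // O(1) membership tests
        return tokens.filter(t => !lookup.has(t))   // keep everything not in the list
    }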
@@ -217,7 +217,7 @@ def findResult(content):
filtered.append(i)
titles['info']['posts'] = len(filtered)
filtered = [i for i in sorted(
-filtered, key=lambda x: x['pushes'], reverse=True)[:50]]
+filtered, key=lambda x: x['pushes'], reverse=True)[:(30 if (content['keyword'] == "") else 100)]]
print('Generation up to step one took', int(time()-timeStart), '')
counter = 0
total = len(filtered)
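
An empty keyword means the default overview, which now keeps only the top 30 posts by push count, while a keyword search digs through the top 100 instead of the previous flat 50. The same selection sketched in JavaScript (the helper name topPosts is hypothetical):

    // Sort posts by push count and keep an adaptive number of them.
    function topPosts(posts, keyword) {
        const cap = keyword === '' ? 30 : 100  // overview stays small, searches go deeper
        return [...posts].sort((a, b) => b.pushes - a.pushes).slice(0, cap)
    }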
@@ -299,7 +299,7 @@ def getDefault(startDate, endDate):
filtered.append(i)
titles['info']['posts'] = len(filtered)
filtered = [i for i in sorted(
-filtered, key=lambda x: x['pushes'], reverse=True)[:50]]
+filtered, key=lambda x: x['pushes'], reverse=True)[:30]]
counter = 0
total = len(postContents)
content = {

@@ -40,7 +40,7 @@ def possegPushes(ids: list, stopwords: list, keyword: str):
result = []
for index, p in enumerate(possegs):
words = [i[1] for i in p['content'] if i[0] not in [
-'eng', 'x', 'm'] and i[1] not in stopwords]
+'eng', 'x', 'm']]
if(keyword == '' or keyword in words):
result.append({
'posString': ' '.join(words),

@@ -19,3 +19,52 @@
新聞
標題
內文
+可以
+沒有
+就是
+自己
+大家
+我們
+知道
+網址
+備註
+連結
+所以

@@ -10,6 +10,7 @@ var wordTitleList
var randId
var globKeyword = ''
var stopwords = []
+var tsvString
function init() {
$.ajax({
@@ -35,7 +36,7 @@ function init() {
$('#graphInfo').empty()
$('#graphInfo').attr('style', 'margin: 10px;').append('Total posts: ' + json.info.posts + ',' + keywordCountString)
totalPosts = json.info.posts
-buildSentetree(tsvString)
+buildSentetree()
}
})
$(document).ready(function() {
@@ -102,7 +103,6 @@ function addStopWord() {
}
function showStopwordEditor() {
-console.log(stopwords)
$(window).unbind('keydown')
$(window).keydown(function(event) {
if (event.keyCode == 13) {
@@ -175,7 +175,7 @@ function sendRequest() {
startDate: $('#startDate').val(),
endDate: $('#endDate').val(),
keyword: $('#keywordBox').val(),
-stopwords: stopwords,
+stopwords: [],
pos: {
noun: $('#noun').is(':checked'),
verb: $('#verb').is(':checked'),
@@ -216,19 +216,21 @@ function changeGraph(data) {
$('#graphInfo').attr('style', 'margin: 10px;').append('Total posts: ' + json.info.posts + keywordCountString)
totalPosts = json.info.posts
destroyCurrentGraph()
d3.select('#graph').append('div').attr('id', 'vis')
-buildSentetree(tsvString)
+buildSentetree()
}
function destroyCurrentGraph() {
d3.selectAll('#vis').remove()
d3.select('#graph').append('div').attr('id', 'vis')
}
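// destroyCurrentGraph() and buildSentetree() together form the rebuild cycle that
// every stopword edit triggers: tear down the #vis container, re-append a fresh one,
// and rebuild the tree from the cached tsvString. A hypothetical wrapper (sketch):
function rebuildTree() {
    destroyCurrentGraph()  // removes #vis and appends a fresh, empty one
    buildSentetree()       // re-renders from the global tsvString and stopwords
}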
function hideTitles() {
$('#titleListLayer').addClass('hidden')
+$('#setToKeyword').unbind()
+$("#addToStopwords").unbind()
}
-function buildSentetree(tsvString) {
+function buildSentetree() {
console.log("Build.")
var model;
var tree;
@@ -253,10 +255,9 @@ function buildSentetree(tsvString) {
fontSize: [15, 40],
gapBetweenGraph: 10
});
console.log(tree)
tree.data(model.getRenderedGraphs(2))
.on('nodeClick', node => {
$("#keywordBox").val(node.data.entity)
$('#titleListLayer').removeClass('hidden')
seqList = node.data.seq.DBs.map(function(n) {
return n.rawText
})
@@ -273,6 +274,27 @@
info = wordTitleList[node.data.entity]
$('#titleListKeyword').html(node.data.entity)
$('#titleListKeywordInfo').html('')
+if (stopwords.indexOf(node.data.entity) < 0) {
+    $("#addToStopwords").html('Set as stopword').css('background-color', '#379').click(() => {
+        stopwords.push(node.data.entity)
+        destroyCurrentGraph()
+        buildSentetree()
+        hideTitles()
+    })
+} else {
+    $("#addToStopwords").html('Remove from stopwords').css('background-color', '#933').click(() => {
+        stopwords.splice(stopwords.indexOf(node.data.entity), 1)  // pop(x) ignores its argument; remove the entity by index
+        destroyCurrentGraph()
+        buildSentetree()
+        hideTitles()
+    })
+}
+$('#setToKeyword').click(() => {
+    $('#keywordBox').val(node.data.entity)
+    sendRequest()
+    hideTitles()
+})
+$('#titleListLayer').removeClass('hidden')
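// Note: Array.prototype.pop ignores its argument and removes the last element,
// so the original stopwords.pop(node.data.entity) would delete the wrong word;
// the splice/indexOf form above removes the clicked entity. An equivalent
// toggle helper (hypothetical sketch, using the same globals):
function toggleStopword(word) {
    const i = stopwords.indexOf(word)
    if (i < 0) stopwords.push(word)  // not yet a stopword: add it
    else stopwords.splice(i, 1)      // already a stopword: remove that entry
    destroyCurrentGraph()
    buildSentetree()
    hideTitles()
}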
$.ajax({
type: 'POST',
url: '/ptt/keywordFrequency',

File diff suppressed because it is too large.

@@ -19143,7 +19143,7 @@
words.forEach(function(w) {
var value = fdist[w];
-if (value < maxSupport && value > maxc) {
+if (value < maxSupport && value > maxc && (isNotRoot || stopwords.indexOf(itemset[w]) < 0)) {
maxw = +w;
maxc = value;
}
@@ -19185,7 +19185,7 @@
}
}
-return { word: word, pos: pos, count: count, s0: s0, s1: s1 };
+return { word: word, pos: pos, count: stopwords.indexOf(itemset[word]) < 0 ? count : minSupport, s0: s0, s1: s1 };
}
function expandSeqTree(rootSeq, graphs, expandCnt, minSupport, maxSupport, terms, itemset) {
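
These two bundled-library edits are what let stopwords take effect client-side: when choosing a root word the candidate loop now skips anything in the global stopwords list, and growSeq clamps a stopword's reported count down to minSupport so it is demoted rather than grown into a branch. The guard in isolation (a sketch; itemset maps numeric word ids to their surface strings, and stopwords is the global list edited in the UI):

    // Demote a stopword: report only the minimum support instead of its true count.
    function demotedCount(word, count, minSupport, itemset, stopwords) {
        return stopwords.indexOf(itemset[word]) < 0 ? count : minSupport
    }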
@@ -19344,6 +19344,7 @@
var graphs = [];
var visibleGroups = expandSeqTree(this.rootSeq, graphs, DEFAULT_NODE_COUNT, minSupport, maxSupport, this.terms, itemset);
this.graphs = graphs.filter(function(g) {
return g.nodes.length > 2;
}).slice(0, 10);
@@ -19376,9 +19377,11 @@
key: 'getRenderedGraphs',
value: function getRenderedGraphs(limit) {
var graphs = arguments.length === 1 ? this.graphs.slice(0, limit) : this.graphs;
console.log("slice")
var renderedGraphs = graphs.map(function(g) {
return g.toRenderedGraph();
});
console.log("toRenderedGraph")
var globalFreqRange = [(0, _lodash.min)(renderedGraphs.map(function(g) {
return g.freqRange[0];
})), (0, _lodash.max)(renderedGraphs.map(function(g) {
@@ -37424,7 +37427,6 @@
}).filter(function(entry) {
return entry.tokens.length > 0;
});
return new _TokenizedDataset2.default(tokenizedEntries);
}
}, {
@@ -37855,13 +37857,18 @@
heap.push(n);
});
+let counter = 1;
while (heap.size() > 0) {
+console.log(`in while ${counter++}`)
var parent = heap.pop();
+console.log(heap)
if (parent.merged) {
continue;
}
var groups = [];
+console.log(parent.data.id)
if (parent.leftLinks.length > 1) {
var lNodes = parent.leftLinks.map(function(l) {
return l.source;
@@ -37875,6 +37882,7 @@
});
groups = groups.concat(this.groupMergeableNodes(rNodes));
}
+console.log(groups)
if (groups.length > 0) {
var newNodes = groups.map(function(group) {
@@ -38102,6 +38110,7 @@
var RenderedGraph = function() {
function RenderedGraph(rawGraph) {
+console.log(arguments)
var _ref = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : {},
_ref$bundle = _ref.bundle,
bundle = _ref$bundle === undefined ? true : _ref$bundle,
@@ -38134,8 +38143,10 @@
this.assignNodeIds(nodes);
+console.log(bundle)
if (bundle) {
var bundled = new _GraphBundler2.default(nodes, links).bundle();
+console.log(bundled)
this.nodes = bundled.nodes;
this.links = bundled.links;
this.assignNodeIds(this.nodes);

@@ -1,110 +1,110 @@ static/node_modules/sententree/package.json
 (The diff render duplicated the whole file because every line was re-indented; apart from whitespace, the only change is the package entry point:)
-  "main": "dist/SentenTree.js",
+  "main": "dist/SentenTree.min.js",

@@ -5,141 +5,141 @@ import Link from './Link.js';
import Node from './Node.js';
export default class RenderedGraph {
 (Formatting-only rewrite shown twice by the diff render: the class body is re-indented and multi-line ternaries reflowed; constructor, updateNodeSize, assignNodeIds, getAlignmentConstraints, getLinkConstraints, getConstraints, and toGroupConstraint are functionally unchanged.)

@@ -6,278 +6,278 @@ import RawGraph from './RawGraph.js';
const DEFAULT_NODE_COUNT = 150;
 (Formatting-only rewrite shown twice by the diff render: growSeq, expandSeqTree, updateNodesEdges, printSeq, and the SentenTreeModel class are re-indented, ternaries reflowed, and "ti ++" tightened to "ti++"; no functional change.)

@@ -41,6 +41,8 @@
<ul id="titleListContainer" class="w3-ul w3-hoverable"></ul>
<div id="backButton" style="margin: 20px 0px;">
<button class="general-button" type="button" id="confirm" style="background-color: #379; margin: 0px 20px" onclick="hideTitles()">返回</button>
<button class="general-button" type="button" id="setToKeyword" style="background-color: #379; margin: 0px 20px">設為關鍵詞</button>
<button class="general-button" type='button' id='addToStopwords' style='background-color: #379;margin: 0px 40px;position: absolute;right: 0px;'></button>
</div>
</div>
</div>
