keep state and continue

This commit is contained in:
2020-01-11 09:02:12 +08:00
parent 6134289d0a
commit 0cf04c3f79
7 changed files with 170 additions and 123 deletions

View File

@ -55,7 +55,7 @@ function parseUrls(...args) {
if (arg instanceof Array) {
return arg;
} else if (arg instanceof ExtractResult) {
return arg.squash().filter(v => !!v);
return arg.squash().filter(v => URL_REG.test(v));
} else {
let urlTempl = arg;
if (urlTempl) {

View File

@ -1,41 +1,44 @@
class Extractor {
constructor() {
this._tasks = [];
this._tab = undefined;
this._running = false;
this._results = {};
}
/**
* Add a task to Extractor. \n
* One Extractor can have multiple tasks, which are organized in a task chain.
* Later task will use previous task result as input (target url list).
* So only the first task can have target url arguments, while later tasks can't.
* If url arguments are not given for a later task, it will use the previous task's result as input (target url list).
* @param {...any} args itemsSelector, fieldSelectors, and more args to specify target urls.
*/
task(...args) {
if (!testArgs(...args)) {
console.log(`Invalid task arguments: ${argsToString(...args)}\n\n${signitures}\n`);
return this;
}
// given >2 arguments means the task specifies target page,
// so it won't accept last task result as url list.
// in this case, former tasks are useless, can be cleared.
if (args.length > 2) this.clear();
this._tasks.push(args);
this._tasks.push(new Task(...args));
return this;
}
/**
* Clear tasks and caches.
* Clear tasks and task caches.
*/
clear() {
this._tasks = [];
this._results = [];
return this;
}
/**
* Start the task chain.
*/
async start() {
return this._startTasks(0);
}
/**
* restart from specified task, but don't restart the previous tasks.
* @param {number} from where to restart the tasks, begins with 0
*/
async restart(from = 0) {
let id = this._checkTaskId(from, 0);
if (!id) return;
for (let i = id; i < this._tasks.length; i++) {
this._tasks[i].clean();
}
return this._startTasks(0);
}
async _startTasks(from) {
if (this._running) {
console.log('The Extractor is running. Please wait..');
return;
@ -44,68 +47,28 @@ class Extractor {
console.log('No task to run.');
return;
}
let firstTaskArgs = this._tasks[0];
if (firstTaskArgs.length > 2) {
let tab;
let task = this._tasks[0];
if (task.urls.length) {
// task specifies target urls, create new tab with first url for it
let urls = parseUrls(...firstTaskArgs.slice(2, firstTaskArgs.length));
this._tab = await createTab(urls[0], false);
tab = await createTab(task.urls[0], false);
} else {
this._tab = await getActiveTab(false);
tab = await getActiveTab(true) || await getActiveTab(false);
}
this._running = true;
return this._tasks.reduce((pms, args, i, tasks) => {
return this._tasks.reduce((pms, task, i) => {
return pms.then(
result => {
if (result === undefined) return getData(this._tab, ...args);
this._results[tasks[i - 1]] = result;
return getData(this._tab, ...args, result);
() => {
if (i < from) return;
if (i > 0) {
let prevTask = this._tasks[i - 1];
return task.execute(tab, new ExtractResult(prevTask.results));
}
return task.execute(tab, undefined);
});
}, Promise.resolve(undefined)).then(
result => {
this._results[this._tasks[this._tasks.length - 1]] = result;
this._running = false;
console.log("Tasks are all done.")
this.save();
}
).catch(err => {
this._running = false;
console.log(err)
});
}
/**
* restart from specified task, but don't restart the previous tasks.
* @param {number} taskid from which restart the tasks
*/
async restart(taskid) {
if (this._running) {
console.log('The Extractor is running. Please wait..');
return;
}
taskid = this._checkTaskId(taskid, 1);
if (!taskid) return;
if (taskid == 1) {
this.start();
return;
}
let cache = this._results[this._tasks[taskid - 2]];
if (!cache) {
console.log(`No result cache for task (id ${taskid}). \nMake sure call ".start()" before ".restart()"?`);
return;
}
this._running = true;
this._tab = await createTab(parseUrls(cache)[0], false)
return this._tasks.slice(taskid - 1).reduce((pms, args, i, tasks) => {
return pms.then(
result => {
this._results[tasks[i - 1]] = result;
return getData(this._tab, ...args, result);
});
}, Promise.resolve(cache)).then(
result => {
this._results[this._tasks[this._tasks.length - 1]] = result;
this._running = false;
this.save();
}
() => this.save()
).catch(err => {
this._running = false;
console.log(err)
@ -113,18 +76,15 @@ class Extractor {
}
/**
* Save result of a task
* @param {number} taskid which task id to save.
* @param {number} taskid which task id to save, begins with 0
*/
save(taskid) {
taskid = this._checkTaskId(taskid, this._tasks.length);
if (!taskid) return;
const result = this._results[this._tasks[taskid - 1]];
if (!result) {
console.log(`No result for task #${taskid}. Forget to call ".start()"?`);
return;
}
if (result.data.length <= 1) { // 1 for selector headers
console.log(`No result for task #${taskid}. Forget to call ".start()"?`);
let id = this._checkTaskId(taskid, this._tasks.length - 1);
if (!id) return;
let result = new ExtractResult(this._tasks[id].results);
if (!result.data.length) {
console.log(`No result for task #${id}. Forget to call ".start()"?`);
return;
}
let msg = `
@ -141,9 +101,9 @@ ${result.toString(50) || "- Empty -"}
console.log("No task found.");
return 0;
}
if (defaultId && id === undefined || this.task === null) id = defaultId;
if (isNaN(id) || id < 1 || id > this._tasks.length) {
console.log(`Invalid task id. Rang(1-${this._tasks.length})`);
if (defaultId && id === undefined) id = defaultId;
if (isNaN(id) || id < 0 || id >= this._tasks.length) {
console.log(`Invalid task id. Rang(0-${this._tasks.length - 1})`);
return 0;
}
return id

View File

@ -0,0 +1,66 @@
/**
 * One extraction step: a pair of selectors plus the list of page urls they
 * are applied to. Extracted data is kept per url, so running the task again
 * only visits the urls that have no data yet.
 */
class Task {
    // Extracted data keyed by page url. A truthy entry means "already done"
    // and makes execute() skip that url.
    _data = {};
    /**
     * Create a task.
     * constructor(itemsSelector:string, fieldSelectors:string[])
     * constructor(itemsSelector:string, fieldSelectors:string[], url:string, from:number, to:number, interval:number)
     * constructor(itemsSelector:string, fieldSelectors:string[], url:string, pages:number[])
     * constructor(itemsSelector:string, fieldSelectors:string[], urls:string[])
     * @param {...any} args itemsSelector, fieldSelectors, then optional url arguments.
     * @throws {Error} when testArgs() rejects the arguments.
     */
    constructor(...args) {
        if (!testArgs(...args))
            throw new Error(`Invalid call arguments.\n\n${signitures}\n\n`);
        const [itemsSelector, fieldSelectors, ...urlArgs] = args;
        this._itemsSelector = itemsSelector;
        this._fieldSelectors = fieldSelectors;
        this._urls = parseUrls(...urlArgs);
    }
    /**
     * Target urls of the task. Empty until execute() resolves them when no
     * url arguments were given to the constructor.
     */
    get urls() {
        return this._urls;
    }
    /**
     * Extracted data, keyed by url.
     */
    get data() {
        return this._data;
    }
    /**
     * Data of all urls merged into one list, in url order.
     * Urls without extracted data are left out, so the list never
     * contains undefined/null placeholders.
     */
    get results() {
        return this._urls.reduce((merged, url) => {
            const pageData = this._data[url];
            return pageData == null ? merged : merged.concat(pageData);
        }, []);
    }
    /**
     * Drop all extracted data, so the next execute() visits every url again.
     * The url list itself is kept.
     */
    clean() {
        this._data = {};
    }
    /**
     * Run the task in the given tab, visiting the urls one after another.
     * When the task has no urls yet, they are taken from upstreamData (via
     * parseUrls), or from the tab's current url when there is no upstream data.
     * @param {any} tab tab handed to redirectTab() / extractTabData().
     * @param {any} upstreamData result of the previous task, if any.
     * @returns {Promise<undefined>} resolves when all urls are processed;
     * rejects on the first failing url, keeping the data collected so far.
     * @throws {Error} (as rejection) when no tab is given.
     */
    async execute(tab, upstreamData) {
        if (!tab) throw new Error("No tab to execute the task.");
        if (!this._urls.length) {
            this._urls = upstreamData
                ? parseUrls(upstreamData)
                : [await queryUrl(tab)];
        }
        const urls = this._urls;
        for (const url of urls) {
            // already extracted in a previous (interrupted) run
            if (this._data[url]) continue;
            await redirectTab(tab, url);
            this._data[url] = await extractTabData(tab, this._itemsSelector, this._fieldSelectors);
        }
    }
}