keep state and continue

This commit is contained in:
2020-01-11 09:02:12 +08:00
parent 6134289d0a
commit 0cf04c3f79
7 changed files with 170 additions and 123 deletions

View File

@ -22,6 +22,7 @@
"scripts/background/result.js",
"scripts/background/signiture.js",
"scripts/background/actions.js",
"scripts/background/task.js",
"scripts/background/extractor.js",
"scripts/background/helpers.js"
],

View File

@ -50,17 +50,14 @@ function (itemsSelector:string, fieldSelectors:string[], urls:string[])
function (itemsSelector:string, fieldSelectors:string[], urls:ExtractResult)
```
## Advanced Usage
## Stop Tasks
### Stop Tasks
The only way to stop a task before it finishes is `Closing the target tab`.
Tasks wait for their target elements' appearance, given some elements were loaded asynchronously.
> Tasks wait for their target elements to appear, since some elements are loaded asynchronously.
> If you type a wrong selector, the task waits forever for elements which don't exist.
But if you type a wrong selector, the task waits forever for elements which don't exist.
The only way to stop a task before it finishes is `Closing the host tab`.
### Extract Attributes.
## Extract Attributes.
e.g.: link text and target (use 'selector@attribute')
@ -68,20 +65,43 @@ e.g.: link text and target (use 'selector@attribute')
new Extractor().task('.item', ['a', 'a@href']).start();
```
## Advanced Usage
### Use Task Chain.
e.g.: Collect links from `http://sample.com/abc`, then, Extract data of each link
```js
new Extractor()
.task('.search-list-item', ['a@href'], ["http://sample.com/abc"])
e = new Extractor()
e.task('.search-list-item', ['a@href'], ["http://sample.com/abc"])
.task('list-item', ["a.title", "p.content"])
.start();
```
### Continue Tasks
You can always continue the tasks with the following call, even if the Extractor stopped in the middle of a task:
```js
e.start()
```
The `Extractor` keeps the state of the last execution and starts from where it stopped.
### Restart Tasks
What should I do if I don't want to continue from the last state, but restart from a certain task?
```js
// restart all tasks
e.restart(0)
// restart from 2nd task
e.restart(1)
```
### Save Result of Any Task
To a multiple task (chain) Extractor `e`:
For an Extractor `e` with multiple tasks:
```js
e = new Extractor()
@ -98,37 +118,12 @@ Incase you want to save it again, use:
e.save()
```
You may want to save another task's result, other than the final:
To save another task result, other than the final one:
```js
// save the result of first task
// to the example above, that is a list of urls
e.save(0)
// save the result of second task
e.save(1)
```
### Restart Tasks
In case some later task fails, you don't need to restart all tasks.
Here we have 2 tasks:
```js
e = new Extractor()
e.task('.search-list-item', ['a@href'], ["http://sample.com/abc"])
.task('list-item', ["a.title", "p.content"])
.start();
```
Suppose the second task fails, we can restart and continue from the task 2:
```js
e.restart(2);
```
If you'd like to restart all tasks, use:
```js
e.start();
// or
e.restart();
```

View File

@ -55,7 +55,7 @@ function parseUrls(...args) {
if (arg instanceof Array) {
return arg;
} else if (arg instanceof ExtractResult) {
return arg.squash().filter(v => !!v);
return arg.squash().filter(v => URL_REG.test(v));
} else {
let urlTempl = arg;
if (urlTempl) {

View File

@ -1,41 +1,44 @@
class Extractor {
constructor() {
this._tasks = [];
this._tab = undefined;
this._running = false;
this._results = {};
}
/**
* Add a task to Extractor. \n
* One Extractor can have multiple tasks, which are organized in a task chain.
* Later task will use previous task result as input (target url list).
* So only the first task can have target url arguments, while later tasks can't.
* If url arguments not given within later tasks, they will use previous task result as input (target url list).
* @param {...any} args itemsSelector, fieldSelectors, and more args to specify target urls.
*/
task(...args) {
if (!testArgs(...args)) {
console.log(`Invalid task arguments: ${argsToString(...args)}\n\n${signitures}\n`);
return this;
}
// given >2 arguments means the task specifies target page,
// so it won't accept last task result as url list.
// in this case, former tasks are useless, can be cleared.
if (args.length > 2) this.clear();
this._tasks.push(args);
this._tasks.push(new Task(...args));
return this;
}
/**
* Clear tasks and caches.
* Clear tasks and task caches.
*/
clear() {
this._tasks = [];
this._results = [];
return this;
}
/**
* Start the task chain.
*/
async start() {
return this._startTasks(0);
}
/**
* restart from specified task, but don't restart the previous tasks.
* @param {number} from where to restart the tasks, begins with 0
*/
async restart(from = 0) {
let id = this._checkTaskId(from, 0);
if (!id) return;
for (let i = id; i < this._tasks.length; i++) {
this._tasks[i].clean();
}
return this._startTasks(0);
}
async _startTasks(from) {
if (this._running) {
console.log('The Extractor is running. Please wait..');
return;
@ -44,68 +47,28 @@ class Extractor {
console.log('No task to run.');
return;
}
let firstTaskArgs = this._tasks[0];
if (firstTaskArgs.length > 2) {
let tab;
let task = this._tasks[0];
if (task.urls.length) {
// task specifies target urls, create new tab with first url for it
let urls = parseUrls(...firstTaskArgs.slice(2, firstTaskArgs.length));
this._tab = await createTab(urls[0], false);
tab = await createTab(task.urls[0], false);
} else {
this._tab = await getActiveTab(false);
tab = await getActiveTab(true) || await getActiveTab(false);
}
this._running = true;
return this._tasks.reduce((pms, args, i, tasks) => {
return this._tasks.reduce((pms, task, i) => {
return pms.then(
result => {
if (result === undefined) return getData(this._tab, ...args);
this._results[tasks[i - 1]] = result;
return getData(this._tab, ...args, result);
() => {
if (i < from) return;
if (i > 0) {
let prevTask = this._tasks[i - 1];
return task.execute(tab, new ExtractResult(prevTask.results));
}
return task.execute(tab, undefined);
});
}, Promise.resolve(undefined)).then(
result => {
this._results[this._tasks[this._tasks.length - 1]] = result;
this._running = false;
console.log("Tasks are all done.")
this.save();
}
).catch(err => {
this._running = false;
console.log(err)
});
}
/**
* restart from specified task, but don't restart the previous tasks.
* @param {number} taskid from which restart the tasks
*/
async restart(taskid) {
if (this._running) {
console.log('The Extractor is running. Please wait..');
return;
}
taskid = this._checkTaskId(taskid, 1);
if (!taskid) return;
if (taskid == 1) {
this.start();
return;
}
let cache = this._results[this._tasks[taskid - 2]];
if (!cache) {
console.log(`No result cache for task (id ${taskid}). \nMake sure call ".start()" before ".restart()"?`);
return;
}
this._running = true;
this._tab = await createTab(parseUrls(cache)[0], false)
return this._tasks.slice(taskid - 1).reduce((pms, args, i, tasks) => {
return pms.then(
result => {
this._results[tasks[i - 1]] = result;
return getData(this._tab, ...args, result);
});
}, Promise.resolve(cache)).then(
result => {
this._results[this._tasks[this._tasks.length - 1]] = result;
this._running = false;
this.save();
}
() => this.save()
).catch(err => {
this._running = false;
console.log(err)
@ -113,18 +76,15 @@ class Extractor {
}
/**
* Save result of a task
* @param {number} taskid which task id to save.
* @param {number} taskid which task id to save, begins with 0
*/
save(taskid) {
taskid = this._checkTaskId(taskid, this._tasks.length);
if (!taskid) return;
const result = this._results[this._tasks[taskid - 1]];
if (!result) {
console.log(`No result for task #${taskid}. Forget to call ".start()"?`);
return;
}
if (result.data.length <= 1) { // 1 for selector headers
console.log(`No result for task #${taskid}. Forget to call ".start()"?`);
let id = this._checkTaskId(taskid, this._tasks.length - 1);
if (!id) return;
let result = new ExtractResult(this._tasks[id].results);
if (!result.data.length) {
console.log(`No result for task #${id}. Forget to call ".start()"?`);
return;
}
let msg = `
@ -141,9 +101,9 @@ ${result.toString(50) || "- Empty -"}
console.log("No task found.");
return 0;
}
if (defaultId && id === undefined || this.task === null) id = defaultId;
if (isNaN(id) || id < 1 || id > this._tasks.length) {
console.log(`Invalid task id. Rang(1-${this._tasks.length})`);
if (defaultId && id === undefined) id = defaultId;
if (isNaN(id) || id < 0 || id >= this._tasks.length) {
console.log(`Invalid task id. Rang(0-${this._tasks.length - 1})`);
return 0;
}
return id

View File

@ -0,0 +1,66 @@
/**
 * One extraction step: a pair of selectors plus the list of urls to visit.
 * Extracted rows are cached per url, so a task that was interrupted can be
 * executed again and only the urls without cached data are visited.
 */
class Task {
    // Extraction results keyed by url; a url is absent until it has been extracted.
    _data = {};

    /**
     * Create a task.
     * constructor(itemsSelector:string, fieldSelectors:string[])
     * constructor(itemsSelector:string, fieldSelectors:string[], url:string, from:number, to:number, interval:number)
     * constructor(itemsSelector:string, fieldSelectors:string[], url:string, pages:number[])
     * constructor(itemsSelector:string, fieldSelectors:string[], urls:string[])
     * @param {...any} args itemsSelector, fieldSelectors, then optional url arguments.
     * @throws {Error} when `testArgs` rejects the arguments.
     */
    constructor(...args) {
        if (!testArgs(...args))
            throw new Error(`Invalid call arguments.\n\n${signitures}\n\n`);
        const [itemsSelector, fieldSelectors, ...urlArgs] = args;
        this._itemsSelector = itemsSelector;
        this._fieldSelectors = fieldSelectors;
        this._urls = parseUrls(...urlArgs);
    }

    /** Urls this task visits (may be filled in lazily by `execute`). */
    get urls() {
        return this._urls;
    }

    /** The per-url result cache (the live object, not a copy). */
    get data() {
        return this._data;
    }

    /**
     * Rows of all extracted urls, concatenated in url order.
     * Urls that have no cached data yet are skipped instead of
     * contributing an `undefined` entry.
     */
    get results() {
        return this._urls.reduce((rows, url) => {
            const cached = this._data[url];
            return cached == null ? rows : rows.concat(cached);
        }, []);
    }

    /** Drop all cached results so the next `execute` extracts every url again. */
    clean() {
        this._data = {};
    }

    /**
     * Visit every url of the task in order and cache the extracted data.
     * Urls that already have cached data are not visited again.
     * @param {any} tab the tab used to load the urls.
     * @param {any} upstreamData previous task result; used as url source
     *        when the task has no urls of its own.
     * @returns {Promise<void>}
     * @throws {Error} when no tab is given.
     */
    async execute(tab, upstreamData) {
        if (!tab) throw new Error("No tab to execute the task.");
        if (!this._urls.length) {
            // No urls given at construction: derive them from the upstream
            // result, or fall back to the url currently shown in the tab.
            this._urls = upstreamData
                ? parseUrls(upstreamData)
                : [await queryUrl(tab)];
        }
        // Strictly sequential: the same tab is redirected for each url.
        for (const url of this._urls) {
            if (this._data[url]) continue; // extracted in an earlier run
            await redirectTab(tab, url);
            const extracted = await extractTabData(tab, this._itemsSelector, this._fieldSelectors);
            // Keep the original guard: never cache a "skipped" marker as data.
            if (!MSG_URL_SKIPPED.isEqual(extracted)) {
                this._data[url] = extracted;
            }
        }
    }
}

View File

@ -1,8 +1,11 @@
// Extension name; used as the prefix of every action identifier below.
const EXT_NAME = "DataExtracter";
// Value returned by getWebUrl() (defined elsewhere); used as a regular
// expression to test url strings — presumably matches web page urls, confirm.
const URL_REG = getWebUrl();
// Action identifiers, namespaced with EXT_NAME.
const ACTION_EXTRACT = `${EXT_NAME}:Extract`;
const ACTION_GOTO_URL = `${EXT_NAME}:GoToTUL`;
const ACTION_REPORT_IN = `${EXT_NAME}:ReportIn`;
const ACTION_QUERY_URL = `${EXT_NAME}:QueryURL`;
// Shared ConstMessage instances; the arguments look like (code, description) —
// NOTE(review): confirm against the ConstMessage definition.
const MSG_ELEMENT_NOT_FOUND = new ConstMessage(1, "No element found for at least one selector, maybe it's not loaded yet");
const MSG_URL_SKIPPED = new ConstMessage(100, "Skipped current URL");

File diff suppressed because one or more lines are too long