keep state and continue
This commit is contained in:
@ -22,6 +22,7 @@
|
||||
"scripts/background/result.js",
|
||||
"scripts/background/signiture.js",
|
||||
"scripts/background/actions.js",
|
||||
"scripts/background/task.js",
|
||||
"scripts/background/extractor.js",
|
||||
"scripts/background/helpers.js"
|
||||
],
|
||||
|
||||
73
readme.md
73
readme.md
@ -50,17 +50,14 @@ function (itemsSelector:string, fieldSelectors:string[], urls:string[])
|
||||
function (itemsSelector:string, fieldSelectors:string[], urls:ExtractResult)
|
||||
```
|
||||
|
||||
## Advanced Usage
|
||||
## Stop Tasks
|
||||
|
||||
### Stop Tasks
|
||||
The only way to stop tasks before they finish is `Closing the target tab`.
|
||||
|
||||
Tasks wait for their target elements' appearance, given some elements were loaded asynchronously.
|
||||
> Tasks wait for their target elements to appear, since some elements are loaded asynchronously.
|
||||
> If you type wrong selectors, the task waits forever for elements which don't exist.
|
||||
|
||||
But if you typed wrong selectors, the task waits forever for elements which don't exists.
|
||||
|
||||
The only way to stop tasks before they finish is `Closing the host tab`.
|
||||
|
||||
### Extract Attributes.
|
||||
## Extract Attributes.
|
||||
|
||||
e.g.: link text and target (use 'selector@attribute')
|
||||
|
||||
@ -68,20 +65,43 @@ e.g.: link text and target (use 'selector@attribute')
|
||||
new Extractor().task('.item', ['a', 'a@href']).start();
|
||||
```
|
||||
|
||||
## Advanced Usage
|
||||
|
||||
### Use Task Chain.
|
||||
|
||||
e.g.: Collect links from `http://sample.com/abc`, then, Extract data of each link
|
||||
|
||||
```js
|
||||
new Extractor()
|
||||
.task('.search-list-item', ['a@href'], ["http://sample.com/abc"])
|
||||
e = new Extractor()
|
||||
e.task('.search-list-item', ['a@href'], ["http://sample.com/abc"])
|
||||
.task('list-item', ["a.title", "p.content"])
|
||||
.start();
|
||||
```
|
||||
|
||||
### Continue Tasks
|
||||
|
||||
You can always continue tasks with the following call, even if the Extractor stopped in the middle of a task:
|
||||
|
||||
```js
|
||||
e.start()
|
||||
```
|
||||
|
||||
The `Extractor` keeps the state of the last execution and resumes from where it stopped.
|
||||
|
||||
### Restart Tasks
|
||||
|
||||
What if you don't want to continue from the last state, but want to restart from a certain task instead?
|
||||
|
||||
```js
|
||||
// restart all tasks
|
||||
e.restart(0)
|
||||
// restart from 2nd task
|
||||
e.restart(1)
|
||||
```
|
||||
|
||||
### Save Result of Any Task
|
||||
|
||||
To a multiple task (chain) Extractor `e`:
|
||||
For a multi-task Extractor `e`:
|
||||
|
||||
```js
|
||||
e = new Extractor()
|
||||
@ -98,37 +118,12 @@ Incase you want to save it again, use:
|
||||
e.save()
|
||||
```
|
||||
|
||||
You may want to save another task's result, other than the final:
|
||||
To save the result of a task other than the final one:
|
||||
|
||||
```js
|
||||
// save the result of first task
|
||||
// to the example above, that is a list of urls
|
||||
e.save(0)
|
||||
// save the result of second task
|
||||
e.save(1)
|
||||
```
|
||||
|
||||
### Restart Tasks
|
||||
|
||||
In case a later task fails, you don't need to restart all tasks.
|
||||
|
||||
Here we have 2 tasks:
|
||||
|
||||
```js
|
||||
e = new Extractor()
|
||||
e.task('.search-list-item', ['a@href'], ["http://sample.com/abc"])
|
||||
.task('list-item', ["a.title", "p.content"])
|
||||
.start();
|
||||
```
|
||||
|
||||
Suppose the second task fails, we can restart and continue from the task 2:
|
||||
|
||||
```js
|
||||
e.restart(2);
|
||||
```
|
||||
|
||||
If you'd like to restart all tasks, use:
|
||||
|
||||
```js
|
||||
e.start();
|
||||
// or
|
||||
e.restart();
|
||||
```
|
||||
@ -55,7 +55,7 @@ function parseUrls(...args) {
|
||||
if (arg instanceof Array) {
|
||||
return arg;
|
||||
} else if (arg instanceof ExtractResult) {
|
||||
return arg.squash().filter(v => !!v);
|
||||
return arg.squash().filter(v => URL_REG.test(v));
|
||||
} else {
|
||||
let urlTempl = arg;
|
||||
if (urlTempl) {
|
||||
|
||||
@ -1,41 +1,44 @@
|
||||
class Extractor {
|
||||
constructor() {
|
||||
this._tasks = [];
|
||||
this._tab = undefined;
|
||||
this._running = false;
|
||||
this._results = {};
|
||||
}
|
||||
/**
|
||||
* Add a task to Extractor. \n
|
||||
* One Extractor can have multiple tasks, which are organized in a task chain.
|
||||
* Later task will use previous task result as input (target url list).
|
||||
* So only the first task can have target url arguments, while later tasks can't.
|
||||
* If url arguments are not given for a later task, it uses the previous task's result as input (target url list).
|
||||
* @param {...any} args itemsSelector, fieldSelectors, and more args to specify target urls.
|
||||
*/
|
||||
task(...args) {
|
||||
if (!testArgs(...args)) {
|
||||
console.log(`Invalid task arguments: ${argsToString(...args)}\n\n${signitures}\n`);
|
||||
return this;
|
||||
}
|
||||
// given >2 arguments means the task specifies target page,
|
||||
// so it won't accept last task result as url list.
|
||||
// in this case, former tasks are useless, can be cleared.
|
||||
if (args.length > 2) this.clear();
|
||||
this._tasks.push(args);
|
||||
this._tasks.push(new Task(...args));
|
||||
return this;
|
||||
}
|
||||
/**
|
||||
* Clear tasks and caches.
|
||||
* Clear tasks and task caches.
|
||||
*/
|
||||
clear() {
|
||||
this._tasks = [];
|
||||
this._results = [];
|
||||
return this;
|
||||
}
|
||||
/**
|
||||
* Start the task chain.
|
||||
*/
|
||||
async start() {
|
||||
return this._startTasks(0);
|
||||
}
|
||||
/**
|
||||
* restart from specified task, but don't restart the previous tasks.
|
||||
* @param {number} from where to restart the tasks, begins with 0
|
||||
*/
|
||||
async restart(from = 0) {
|
||||
let id = this._checkTaskId(from, 0);
|
||||
if (!id) return;
|
||||
for (let i = id; i < this._tasks.length; i++) {
|
||||
this._tasks[i].clean();
|
||||
}
|
||||
return this._startTasks(0);
|
||||
}
|
||||
async _startTasks(from) {
|
||||
if (this._running) {
|
||||
console.log('The Extractor is running. Please wait..');
|
||||
return;
|
||||
@ -44,68 +47,28 @@ class Extractor {
|
||||
console.log('No task to run.');
|
||||
return;
|
||||
}
|
||||
let firstTaskArgs = this._tasks[0];
|
||||
if (firstTaskArgs.length > 2) {
|
||||
|
||||
let tab;
|
||||
let task = this._tasks[0];
|
||||
if (task.urls.length) {
|
||||
// task specifies target urls, create new tab with first url for it
|
||||
let urls = parseUrls(...firstTaskArgs.slice(2, firstTaskArgs.length));
|
||||
this._tab = await createTab(urls[0], false);
|
||||
tab = await createTab(task.urls[0], false);
|
||||
} else {
|
||||
this._tab = await getActiveTab(false);
|
||||
tab = await getActiveTab(true) || await getActiveTab(false);
|
||||
}
|
||||
this._running = true;
|
||||
return this._tasks.reduce((pms, args, i, tasks) => {
|
||||
return this._tasks.reduce((pms, task, i) => {
|
||||
return pms.then(
|
||||
result => {
|
||||
if (result === undefined) return getData(this._tab, ...args);
|
||||
this._results[tasks[i - 1]] = result;
|
||||
return getData(this._tab, ...args, result);
|
||||
() => {
|
||||
if (i < from) return;
|
||||
if (i > 0) {
|
||||
let prevTask = this._tasks[i - 1];
|
||||
return task.execute(tab, new ExtractResult(prevTask.results));
|
||||
}
|
||||
return task.execute(tab, undefined);
|
||||
});
|
||||
}, Promise.resolve(undefined)).then(
|
||||
result => {
|
||||
this._results[this._tasks[this._tasks.length - 1]] = result;
|
||||
this._running = false;
|
||||
console.log("Tasks are all done.")
|
||||
this.save();
|
||||
}
|
||||
).catch(err => {
|
||||
this._running = false;
|
||||
console.log(err)
|
||||
});
|
||||
}
|
||||
/**
|
||||
* restart from specified task, but don't restart the previous tasks.
|
||||
* @param {number} taskid from which restart the tasks
|
||||
*/
|
||||
async restart(taskid) {
|
||||
if (this._running) {
|
||||
console.log('The Extractor is running. Please wait..');
|
||||
return;
|
||||
}
|
||||
taskid = this._checkTaskId(taskid, 1);
|
||||
if (!taskid) return;
|
||||
if (taskid == 1) {
|
||||
this.start();
|
||||
return;
|
||||
}
|
||||
let cache = this._results[this._tasks[taskid - 2]];
|
||||
if (!cache) {
|
||||
console.log(`No result cache for task (id ${taskid}). \nMake sure call ".start()" before ".restart()"?`);
|
||||
return;
|
||||
}
|
||||
this._running = true;
|
||||
this._tab = await createTab(parseUrls(cache)[0], false)
|
||||
return this._tasks.slice(taskid - 1).reduce((pms, args, i, tasks) => {
|
||||
return pms.then(
|
||||
result => {
|
||||
this._results[tasks[i - 1]] = result;
|
||||
return getData(this._tab, ...args, result);
|
||||
});
|
||||
}, Promise.resolve(cache)).then(
|
||||
result => {
|
||||
this._results[this._tasks[this._tasks.length - 1]] = result;
|
||||
this._running = false;
|
||||
this.save();
|
||||
}
|
||||
() => this.save()
|
||||
).catch(err => {
|
||||
this._running = false;
|
||||
console.log(err)
|
||||
@ -113,18 +76,15 @@ class Extractor {
|
||||
}
|
||||
/**
|
||||
* Save result of a task
|
||||
* @param {number} taskid which task id to save.
|
||||
* @param {number} taskid which task id to save, begins with 0
|
||||
*/
|
||||
save(taskid) {
|
||||
taskid = this._checkTaskId(taskid, this._tasks.length);
|
||||
if (!taskid) return;
|
||||
const result = this._results[this._tasks[taskid - 1]];
|
||||
if (!result) {
|
||||
console.log(`No result for task #${taskid}. Forget to call ".start()"?`);
|
||||
return;
|
||||
}
|
||||
if (result.data.length <= 1) { // 1 for selector headers
|
||||
console.log(`No result for task #${taskid}. Forget to call ".start()"?`);
|
||||
let id = this._checkTaskId(taskid, this._tasks.length - 1);
|
||||
if (!id) return;
|
||||
let result = new ExtractResult(this._tasks[id].results);
|
||||
|
||||
if (!result.data.length) {
|
||||
console.log(`No result for task #${id}. Forget to call ".start()"?`);
|
||||
return;
|
||||
}
|
||||
let msg = `
|
||||
@ -141,9 +101,9 @@ ${result.toString(50) || "- Empty -"}
|
||||
console.log("No task found.");
|
||||
return 0;
|
||||
}
|
||||
if (defaultId && id === undefined || this.task === null) id = defaultId;
|
||||
if (isNaN(id) || id < 1 || id > this._tasks.length) {
|
||||
console.log(`Invalid task id. Rang(1-${this._tasks.length})`);
|
||||
if (defaultId && id === undefined) id = defaultId;
|
||||
if (isNaN(id) || id < 0 || id >= this._tasks.length) {
|
||||
console.log(`Invalid task id. Rang(0-${this._tasks.length - 1})`);
|
||||
return 0;
|
||||
}
|
||||
return id
|
||||
|
||||
66
scripts/background/task.js
Normal file
66
scripts/background/task.js
Normal file
@ -0,0 +1,66 @@
|
||||
/**
 * One extraction step of an Extractor task chain.
 *
 * A task holds an items selector, a list of field selectors and a list of
 * target URLs. Extracted data is cached per URL, so calling `execute()` again
 * only visits URLs that have no cached data yet.
 */
class Task {
    /** Extracted data, keyed by the URL it was extracted from. */
    _data = {};
    /** True when `_urls` was derived from upstream results instead of constructor arguments. */
    _urlsFromUpstream = false;

    /**
     * Create a task.
     * constructor(itemsSelector:string, fieldSelectors:string[])
     * constructor(itemsSelector:string, fieldSelectors:string[], url:string, from:number, to:number, interval:number)
     * constructor(itemsSelector:string, fieldSelectors:string[], url:string, pages:number[])
     * constructor(itemsSelector:string, fieldSelectors:string[], urls:string[])
     * @param {...any} args items selector, field selectors, then optional URL arguments
     *                      (forwarded as-is to `parseUrls`).
     * @throws {Error} when `testArgs` rejects the arguments.
     */
    constructor(...args) {
        if (!testArgs(...args))
            throw new Error(`Invalid call arguments.\n\n${signitures}\n\n`);
        const [itemsSelector, fieldSelectors, ...urlArgs] = args;
        this._itemsSelector = itemsSelector;
        this._fieldSelectors = fieldSelectors;
        this._urls = parseUrls(...urlArgs);
    }

    /** Target URLs of this task (empty until resolved when none were given explicitly). */
    get urls() {
        return this._urls;
    }

    /** Raw per-URL cache of extracted data. */
    get data() {
        return this._data;
    }

    /**
     * Extracted data of all target URLs, concatenated in URL order.
     * URLs without cached data are skipped, so a task that has not run
     * (or has only partly run) never yields `undefined` placeholders.
     */
    get results() {
        return this._urls.flatMap(url => this._data[url] ?? []);
    }

    /**
     * Drop all cached data so the next `execute()` extracts every URL again.
     * URLs that were derived from upstream results are dropped too, because the
     * upstream task may produce a different list when it is re-run; explicitly
     * given URLs are kept.
     */
    clean() {
        this._data = {};
        if (this._urlsFromUpstream) {
            this._urls = [];
            this._urlsFromUpstream = false;
        }
    }

    /**
     * Run the task in the given tab, visiting target URLs one after another.
     * URLs that already have cached data are skipped, which is what lets an
     * interrupted task continue where it stopped.
     * @param {any} tab the tab handle passed to `queryUrl`, `redirectTab` and `extractTabData`.
     * @param {any} upstreamData previous task's result; used (via `parseUrls`) as the
     *                           URL list when this task has no URLs of its own.
     * @returns {Promise<void>} settles when all URLs are processed; rejects on the first failure.
     * @throws {Error} when no tab is given.
     */
    async execute(tab, upstreamData) {
        if (!tab) throw new Error("No tab to execute the task.");
        if (!this._urls.length) {
            if (upstreamData) {
                this._urls = parseUrls(upstreamData);
                this._urlsFromUpstream = true;
            } else {
                // no explicit or upstream URLs: work on the page the tab currently shows
                this._urls = [await queryUrl(tab)];
            }
        }
        // Sequential on purpose: every URL is loaded into the same tab.
        // An empty URL list simply does nothing (no bogus cache entry is written).
        for (const url of this._urls) {
            if (this._data[url]) continue;
            await redirectTab(tab, url);
            this._data[url] = await extractTabData(tab, this._itemsSelector, this._fieldSelectors);
        }
    }
}
|
||||
@ -1,8 +1,11 @@
|
||||
const EXT_NAME = "DataExtracter";
|
||||
|
||||
const URL_REG = getWebUrl();
|
||||
|
||||
const ACTION_EXTRACT = `${EXT_NAME}:Extract`;
|
||||
const ACTION_GOTO_URL = `${EXT_NAME}:GoToTUL`;
|
||||
const ACTION_REPORT_IN = `${EXT_NAME}:ReportIn`;
|
||||
const ACTION_QUERY_URL = `${EXT_NAME}:QueryURL`;
|
||||
|
||||
const MSG_ELEMENT_NOT_FOUND = new ConstMessage(1, "No element found for at least one selector, maybe it's not loaded yet");
|
||||
const MSG_URL_SKIPPED = new ConstMessage(100, "Skipped current URL");
|
||||
|
||||
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user