keep state and continue

2020-01-11 09:02:12 +08:00
parent 6134289d0a
commit 0cf04c3f79
7 changed files with 170 additions and 123 deletions
@@ -22,6 +22,7 @@
      "scripts/background/result.js",
      "scripts/background/signiture.js",
      "scripts/background/actions.js",
+      "scripts/background/task.js",
      "scripts/background/extractor.js",
      "scripts/background/helpers.js"
    ],
@@ -50,17 +50,14 @@ function (itemsSelector:string, fieldSelectors:string[], urls:string[])
 function (itemsSelector:string, fieldSelectors:string[], urls:ExtractResult)
 ```

-## Advanced Usage
+## Stop Tasks

-### Stop Tasks
+The only way to stop tasks before they finish is `Closing the target tab`.

-Tasks wait for their target elements' appearance, given some elements were loaded asynchronously.
+> Tasks wait for their target elements to appear, since some elements are loaded asynchronously.  
+> If you typed wrong selectors, the task waits forever for elements which don't exist.

-But if you typed wrong selectors, the task waits forever for  elements which don't exists.
-
-The only way to stop tasks before its finish, is `Closing the host tab`.
-
-### Extract Attributes.
+## Extract Attributes.

 e.g.: link text and target (use 'selector@attribute')

@@ -68,20 +65,43 @@ e.g.: link text and target (use 'selector@attribute')
 new Extractor().task('.item', ['a', 'a@href']).start();
 ```

+## Advanced Usage
+
 ### Use Task Chain.

 e.g.: Collect links from `http://sample.com/abc`, then, Extract data of each link

 ```js
-new Extractor()
-    .task('.search-list-item', ['a@href'], ["http://sample.com/abc"])
+e = new Extractor()
+e.task('.search-list-item', ['a@href'], ["http://sample.com/abc"])
    .task('list-item', ["a.title", "p.content"])
    .start();
 ```

+### Continue Tasks
+
+You can always continue tasks (with the following), even if it stopped in the middle of a task:
+
+```js
+e.start()
+```
+
+The `Extractor` keeps the state of the last execution, and starts from where it stopped.
+
+### Restart Tasks
+
+What should I do if I don't want to continue from the last state, but restart from a certain task?
+
+```js
+// restart all tasks
+e.restart(0)
+// restart from 2nd task
+e.restart(1)
+```
+
 ### Save Result of Any Task

-To a multiple task (chain) Extractor `e`:
+For a multi-task Extractor `e`:

 ```js
 e = new Extractor()
@@ -98,37 +118,12 @@ Incase you want to save it again, use:
 e.save()
 ```

-You may want to save another task's result, other than the final:
+To save the result of another task, other than the final one:

 ```js
 // save the result of first task
 // to the example above, that is a list of urls
+e.save(0)
+// save the result of second task
 e.save(1)
 ```
-
-### Restart Tasks
-
-In cases some later task fails, you don't need to restart all task.
-
-Here we have 2 tasks:
-
-```js
-e = new Extractor()
-e.task('.search-list-item', ['a@href'], ["http://sample.com/abc"])
-    .task('list-item', ["a.title", "p.content"])
-    .start();
-```
-
-Suppose the second task fails, we can restart and continue from the task 2:
-
-```js
-e.restart(2);
-```
-
-If you'd like restart all task, use:
-
-```js
-e.start();
-// or
-e.restart();
-```
@@ -55,7 +55,7 @@ function parseUrls(...args) {
    if (arg instanceof Array) {
        return arg;
    } else if (arg instanceof ExtractResult) {
-        return arg.squash().filter(v => !!v);
+        return arg.squash().filter(v => URL_REG.test(v));
    } else {
        let urlTempl = arg;
        if (urlTempl) {
@@ -1,41 +1,44 @@
 class Extractor {
    constructor() {
        this._tasks = [];
-        this._tab = undefined;
        this._running = false;
-        this._results = {};
    }
    /**
     * Add a task to Extractor. \n
     * One Extractor can have multiple tasks, which are organized in a task chain.
-     * Later task will use previous task result as input (target url list).
-     * So only the first task can have target url arguments, while later tasks can't.
+     * If url arguments are not given for later tasks, they will use the previous task's result as input (target url list).
     * @param {...any} args itemsSelector, fieldSelectors, and more args to specify target urls.
     */
    task(...args) {
-        if (!testArgs(...args)) {
-            console.log(`Invalid task arguments: ${argsToString(...args)}\n\n${signitures}\n`);
-            return this;
-        }
-        // given >2 arguments means the task specifies target page, 
-        // so it won't accept last task result as url list.
-        // in this case, former tasks are useless, can be cleared.
-        if (args.length > 2) this.clear();
-        this._tasks.push(args);
+        this._tasks.push(new Task(...args));
        return this;
    }
    /**
-     * Clear tasks and caches.
+     * Clear tasks and task caches.
     */
    clear() {
        this._tasks = [];
-        this._results = [];
        return this;
    }
    /**
     * Start the task chain.
     */
    async start() {
+        return this._startTasks(0);
+    }
+    /**
+     * restart from specified task, but don't restart the previous tasks.
+     * @param {number} from where to restart the tasks, begins with 0
+     */
+    async restart(from = 0) {
+        let id = this._checkTaskId(from, 0);
+        if (!id) return;
+        for (let i = id; i < this._tasks.length; i++) {
+            this._tasks[i].clean();
+        }
+        return this._startTasks(0);
+    }
+    async _startTasks(from) {
        if (this._running) {
            console.log('The Extractor is running. Please wait..');
            return;
@@ -44,68 +47,28 @@ class Extractor {
            console.log('No task to run.');
            return;
        }
-        let firstTaskArgs = this._tasks[0];
-        if (firstTaskArgs.length > 2) {
+
+        let tab;
+        let task = this._tasks[0];
+        if (task.urls.length) {
            // task specifies target urls, create new tab with first url for it
-            let urls = parseUrls(...firstTaskArgs.slice(2, firstTaskArgs.length));
-            this._tab = await createTab(urls[0], false);
+            tab = await createTab(task.urls[0], false);
        } else {
-            this._tab = await getActiveTab(false);
+            tab = await getActiveTab(true) || await getActiveTab(false);
        }
        this._running = true;
-        return this._tasks.reduce((pms, args, i, tasks) => {
+        return this._tasks.reduce((pms, task, i) => {
            return pms.then(
-                result => {
-                    if (result === undefined) return getData(this._tab, ...args);
-                    this._results[tasks[i - 1]] = result;
-                    return getData(this._tab, ...args, result);
+                () => {
+                    if (i < from) return;
+                    if (i > 0) {
+                        let prevTask = this._tasks[i - 1];
+                        return task.execute(tab, new ExtractResult(prevTask.results));
+                    }
+                    return task.execute(tab, undefined);
                });
        }, Promise.resolve(undefined)).then(
-            result => {
-                this._results[this._tasks[this._tasks.length - 1]] = result;
-                this._running = false;
-                console.log("Tasks are all done.")
-                this.save();
-            }
-        ).catch(err => {
-            this._running = false;
-            console.log(err)
-        });
-    }
-    /**
-     * restart from specified task, but don't restart the previous tasks.
-     * @param {number} taskid from which restart the tasks
-     */
-    async restart(taskid) {
-        if (this._running) {
-            console.log('The Extractor is running. Please wait..');
-            return;
-        }
-        taskid = this._checkTaskId(taskid, 1);
-        if (!taskid) return;
-        if (taskid == 1) {
-            this.start();
-            return;
-        }
-        let cache = this._results[this._tasks[taskid - 2]];
-        if (!cache) {
-            console.log(`No result cache for task (id ${taskid}). \nMake sure call ".start()" before ".restart()"?`);
-            return;
-        }
-        this._running = true;
-        this._tab = await createTab(parseUrls(cache)[0], false)
-        return this._tasks.slice(taskid - 1).reduce((pms, args, i, tasks) => {
-            return pms.then(
-                result => {
-                    this._results[tasks[i - 1]] = result;
-                    return getData(this._tab, ...args, result);
-                });
-        }, Promise.resolve(cache)).then(
-            result => {
-                this._results[this._tasks[this._tasks.length - 1]] = result;
-                this._running = false;
-                this.save();
-            }
+            () => this.save()
        ).catch(err => {
            this._running = false;
            console.log(err)
@@ -113,18 +76,15 @@ class Extractor {
    }
    /**
     * Save result of a task
-     * @param {number} taskid which task id to save.
+     * @param {number} taskid which task id to save, begins with 0
     */
    save(taskid) {
-        taskid = this._checkTaskId(taskid, this._tasks.length);
-        if (!taskid) return;
-        const result = this._results[this._tasks[taskid - 1]];
-        if (!result) {
-            console.log(`No result for task #${taskid}. Forget to call ".start()"?`);
-            return;
-        }
-        if (result.data.length <= 1) { // 1 for selector headers
-            console.log(`No result for task #${taskid}. Forget to call ".start()"?`);
+        let id = this._checkTaskId(taskid, this._tasks.length - 1);
+        if (!id) return;
+        let result = new ExtractResult(this._tasks[id].results);
+
+        if (!result.data.length) {
+            console.log(`No result for task #${id}. Forget to call ".start()"?`);
            return;
        }
        let msg = `
@@ -141,9 +101,9 @@ ${result.toString(50) || "- Empty -"}
            console.log("No task found.");
            return 0;
        }
-        if (defaultId && id === undefined || this.task === null) id = defaultId;
-        if (isNaN(id) || id < 1 || id > this._tasks.length) {
-            console.log(`Invalid task id. Rang(1-${this._tasks.length})`);
+        if (defaultId && id === undefined) id = defaultId;
+        if (isNaN(id) || id < 0 || id >= this._tasks.length) {
+            console.log(`Invalid task id. Rang(0-${this._tasks.length - 1})`);
            return 0;
        }
        return id
@@ -0,0 +1,66 @@
+class Task {
+    // _manager = undefined;
+    // _id = 0;
+    // _urls = [];
+    _data = {}; // extracted results, keyed by the url they were extracted from
+    /**
+     * Create a task.
+     * The first argument is stored as the items selector, the second as the
+     * field selectors; all remaining arguments are handed to parseUrls to
+     * build the task's url list.
+     * Accepted signatures:
+     * constructor(itemsSelector:string, fieldSelectors:string[])
+     * constructor(itemsSelector:string, fieldSelectors:string[], url:string, from:number, to:number, interval:number)
+     * constructor(itemsSelector:string, fieldSelectors:string[], url:string, pages:number[])
+     * constructor(itemsSelector:string, fieldSelectors:string[], urls:string[])
+     * @param {...any} args itemsSelector, fieldSelectors, then optional url arguments
+     * @throws {Error} when testArgs rejects the arguments
+     */
+    constructor(...args) {
+        if (!testArgs(...args))
+            throw new Error(`Invalid call arguments.\n\n${signitures}\n\n`);
+        this._itemsSelector = args.shift();
+        this._fieldSelectors = args.shift();
+        this._urls = parseUrls(...args); // whatever is left after the two selectors
+    }
+    get urls() { // url list of this task (may be filled in later by execute)
+        return this._urls;
+    }
+    get data() { // raw per-url result map
+        return this._data;
+    }
+    get results() { // per-url results concatenated in url order
+        return this._urls.reduce((p, c) => {
+            return p.concat(this._data[c]); // NOTE(review): a url without stored data contributes an `undefined` entry here - confirm intended
+        }, []);
+    }
+    clean() { // drop all stored per-url results
+        this._data = {};
+    }
+    async execute(tab, upstreamData) { // visit the task's urls one after another in `tab`, storing extracted data per url
+        if (!tab) throw new Error("No tab to execute the task.");
+        if (!this._urls.length) { // no urls from the constructor: derive them now
+            if (upstreamData) {
+                this._urls = parseUrls(upstreamData); // urls taken from the upstream data
+            } else {
+                this._urls = [await queryUrl(tab)]; // fall back to the single url queryUrl reports for the tab
+            }
+        }
+        return this._urls.reduce((p, url, i) => p.then( // promise chain: each url starts after the previous step resolves
+            results => {
+                if (i > 0) { // `results` here belongs to the previous url
+                    if (!MSG_URL_SKIPPED.isEqual(results)) { // only store it when that url was actually processed
+                        let lastURL = this._urls[i - 1];
+                        this._data[lastURL] = results;
+                    } 
+                }
+                return this._data[url] ? MSG_URL_SKIPPED : redirectTab(tab, url).then( // urls that already have stored data are skipped
+                    () => extractTabData(tab, this._itemsSelector, this._fieldSelectors)
+                );
+            }
+        ), Promise.resolve(null)).then(
+            results => { // store the outcome of the final url, unless it was skipped
+                if (!MSG_URL_SKIPPED.isEqual(results)) {
+                    let lastURL = this._urls[this._urls.length - 1];
+                    this._data[lastURL] = results;
+                    return;
+                }
+            }
+        );
+    }
+}
@@ -1,8 +1,11 @@
 const EXT_NAME = "DataExtracter";

+const URL_REG = getWebUrl();
+
 const ACTION_EXTRACT = `${EXT_NAME}:Extract`;
 const ACTION_GOTO_URL = `${EXT_NAME}:GoToTUL`;
 const ACTION_REPORT_IN = `${EXT_NAME}:ReportIn`;
 const ACTION_QUERY_URL = `${EXT_NAME}:QueryURL`;

 const MSG_ELEMENT_NOT_FOUND = new ConstMessage(1, "No element found for at least one selector, maybe it's not loaded yet");
+const MSG_URL_SKIPPED = new ConstMessage(100, "Skipped current URL");