update documents

task chain & management
extractTabData report [] if any field element not found
2018-09-28 14:14:59 +08:00 · 2018-09-28 11:17:05 +08:00 · 2018-09-27 20:57:42 +08:00 · 2018-09-27 18:19:06 +08:00 · 2018-09-27 16:40:05 +08:00
9 changed files with 307 additions and 35 deletions
--- a/images/console.png
+++ b/images/console.png
--- a/images/extnsion.png
+++ b/images/extnsion.png
--- a/manifest.json
+++ b/manifest.json
@ -1,7 +1,7 @@
 {
  "manifest_version": 2,
  "name": "Data Extracter",
-  "version": "0.0.1",
+  "version": "0.1.0",
  "author": "jebbs",
  "description": "Extract data from web page elements as sheet.",
  "icons": {
@ -19,7 +19,8 @@
      "scripts/background.js",
      "scripts/result.js",
      "scripts/tools.js",
-      "scripts/extract.js"
+      "scripts/extract.js",
+      "scripts/extractor.js"
    ],
    "persistent": false
  },
--- a/popup/tip.html
+++ b/popup/tip.html
@ -21,12 +21,11 @@
                <div class="alert alert-info small">
                    <!-- <h6>Usage:</h6> -->
                    <p>
-                        <b>Open console</b> and
-                        <b>switch to Data Extracter</b>, then call the
-                        <b>extract</b> function.
+                        <b>Open the console of the extension background page</b> and
+                        type your scripts.
                    </p>
                    <p>
-                        <img src="demo.png" alt="" style="max-width: 489px; width: 100%; border-radius: 5px">
+                        <img src="../images/console.png" alt="" style="max-width: 489px; width: 100%; border-radius: 5px">
                    </p>

                </div>
@ -41,19 +40,21 @@
        <div class="row">
            <div class="col">
                <div class="alert alert-success small">
-                    <p>
-                        <b>View Help</b>:
-                        <br>extract()
-                    </p>
                    <p>
                        <b>Extract current page</b>:
-                        <br>extract("list-item", ["a.title", "p.content"])
+                        <br>new Extractor().task(".list-item", ["a.title", "p.content"]).start();
                    </p>
                    <p>
                        <b>Extract multiple pages (1-10, interval 1)</b>:
-                        <br>extract("list-item", ["a.title", "p.content"], "http://sample.com/?pn=${page}", 1, 10, 1)
+                        <br>new Extractor().task(".list-item", ["a.title", "p.content"],
+                        "http://sample.com/?pn=${page}", 1, 10, 1).start();

                    </p>
+                    <p>
+                        <b>Full document (Right click - Open in new tab):</b>
+                        <br>
+                        <a href="https://git.jebbs.co/jebbs/data-extracter-extesion">https://git.jebbs.co/jebbs/data-extracter-extesion</a>
+                    </p>
                </div>
            </div>
        </div>
--- a/readme.md
+++ b/readme.md
@ -0,0 +1,138 @@
+# DataExtracter Help
+----------------------------
+
+DataExtracter helps you quickly extract data from any web page. All you need to do is:
+
+- Find out the selectors (JQuery selectors) for target data
+- Call Extractor methods in the `extension background page console`, as introduced below.
+
+Where is the extension background page console?
+
+Go to <chrome://extensions/> and click the `background page` link of the extension.
+
+ ![](images/extnsion.png)
+
+In the opening window, find `Console`, and type your scripts.
+
+ ![](images/console.png)
+
+## Quick Start
+
+
+
+Extract current page
+```js
+new Extractor().task(".list-item", ["a.title", "p.content"]).start();
+```
+
+Extract multiple pages (1-10, interval 1)
+
+```js
+new Extractor().task(".list-item", ["a.title", "p.content"],"http://sample.com/?pn=${page}", 1, 10, 1).start();
+```
+
+Extract multiple urls (list)
+
+```js
+new Extractor().task(".list-item", ["a.title", "p.content"],["http://sample.com/abc","http://sample.com/xyz"]).start();
+```
+
+Extract specified pages (1,3,5)
+
+```js
+new Extractor().task(".list-item", ["a.title", "p.content"], "http://sample.com/?pn=${page}", [1, 3, 5]).start();
+```
+
+## Extractor.task() Signatures:
+
+```ts
+// a task extracting data from current page
+task(itemsSelector:string, fieldSelectors:string[])
+// a task extracting data from a range of pages
+task(itemsSelector:string, fieldSelectors:string[], urlTemplate:string, from:number, to:number, interval:number)
+// a task extracting data from specified pages of a url template
+task(itemsSelector:string, fieldSelectors:string[], urlTemplate:string, pages:number[])
+// a task extracting data from a list of urls
+task(itemsSelector:string, fieldSelectors:string[], urls:string[])
+// a task extracting data from urls which were extracted by the previous task
+task(itemsSelector:string, fieldSelectors:string[], urls:ExractResult)
+```
+
+## Advanced Usage:
+
+### Stop tasks
+
+The only way to stop tasks before they finish is `Closing the Tab` which runs the tasks.
+
+### Extract attributes. 
+
+e.g.: link text and target (use 'selector@attribute')
+
+```js
+new Extractor().task('.list-item', ['a.title', 'a.title@href']).start();
+```
+
+### Use task chain. 
+
+e.g.: Collect links from `http://sample.com/abc` & Extract data of each link
+
+```js
+new Extractor()
+    .task('.search-list-item', ['.item a@href'], ["http://sample.com/abc"])
+    .task('.list-item', ["a.title", "p.content"])
+    .start();
+```
+
+### Save result of any task
+
+For a multi-task (chained) Extractor `e`:
+
+```js
+e = new Extractor()
+e.task('.search-list-item', ['.item a@href'], ["http://sample.com/abc"])
+    .task('.list-item', ["a.title", "p.content"])
+    .start();
+```
+
+The user will be asked to save the final result when it finishes.
+
+You may want to save another task's result, other than the final:
+
+```js
+// save the result of first task
+// that is, a list of urls
+e.save(1)
+```
+
+In case you want to save the final result again, use:
+
+```js
+e.save()
+```
+
+### Restart tasks
+
+In case a later task fails, you don't need to restart all tasks.
+
+Here we have 2 tasks:
+
+```js
+e = new Extractor()
+e.task('.search-list-item', ['.item a@href'], ["http://sample.com/abc"])
+    .task('.list-item', ["a.title", "p.content"])
+    .start();
+```
+
+Suppose the second task fails; we can restart and continue from task 2:
+
+```js
+e.restart(2);
+```
+
+If you'd like to restart all tasks, use:
+
+```js
+e.start();
+// or
+e.restart();
+```
--- a/scripts/content.js
+++ b/scripts/content.js
@ -36,12 +36,23 @@ chrome.runtime.onMessage.addListener(
 );

 function extractTabData(itemsSelector, fieldSelectors) {
-    return $(itemsSelector).toArray().map(
-        item => fieldSelectors.map(
-            selector => {
-                let [cls, attr] = selector.split('@').slice(0, 2);
-                return $(item).find(cls).toArray().map(find => attr ? find[attr] : find.textContent.trim()).join('\n')
-            }
-        )
+    let fieldNotFound = false;
+    let results = $(itemsSelector).toArray().map(
+        item => {
+            return fieldSelectors.map(
+                selector => {
+                    let [cls, attr] = selector.split('@').slice(0, 2);
+                    // TODO: close tab to cancel task tip
+                    if (fieldNotFound) return;
+                    let fieldVals = $(item).find(cls).toArray();
+                    if (!fieldVals.length) {
+                        fieldNotFound = true;
+                        return;
+                    }
+                    return fieldVals.map(find => attr ? find[attr] : find.textContent.trim()).join('\n')
+                }
+            )
+        }
    );
+    return fieldNotFound ? [] : results
 }
--- a/scripts/extract.js
+++ b/scripts/extract.js
@ -28,7 +28,7 @@ async function getData(itemsSelector, fieldSelectors, ...args) {
        if (arg instanceof Array) {
            urls = arg;
        } else if (arg instanceof ExractResult) {
-            urls = arg.column(0);
+            urls = arg.squash().filter(v => !!v);
        } else {
            let urlTempl = arg;
            if (urlTempl) {
@ -103,7 +103,7 @@ function extractTabData(tab, itemsSelector, fieldSelectors) {
        itemsSelector: itemsSelector,
        fieldSelectors: fieldSelectors
    }
-    let cond = r => !!r;
+    let cond = r => r && r.length;
    return sendMessage(tab, req, cond);
 }

@ -139,31 +139,30 @@ function queryUrl(tab, urlExcluded) {
 * @param {object} tab the table where to send the message
 * @param {object} req the request data.
 * @param {function} cond success condition function, r:any=>boolean
- * @param {number} failedTimeOut fail time out
- * @param {number} detectInterval interval for detecting
+ * @param {number} interval interval for detecting
 * @return {Promise} a promise of the response.
 */
-function sendMessage(tab, req, cond, failedTimeOut, detectInterval) {
+function sendMessage(tab, req, cond, interval) {
    req.from = "DataExtracter:" + req.from;
-    failedTimeOut = failedTimeOut || 10000;
-    detectInterval = detectInterval || 500;
+    interval = interval || 500;
    return new Promise((resolve, reject) => {
-        let timeOut;
-        let rejectTimeout = setTimeout(() => {
-            reject(`${req.from} failed after ${failedTimeOut/1000} seconds.`);
-            clearTimeout(timeOut);
-        }, failedTimeOut);
+
        loop();

-        function loop() {
+        async function loop() {
+            console.log("request for", req.from);
+            let tabAvailable = await getTabByID(tab.id);
+            if (!tabAvailable) {
+                throw new Error("Task interupted due to the target tab is closed.");
+            }
+
            chrome.tabs.sendMessage(tab.id, req, r => {
                if (!cond || cond(r)) {
-                    clearTimeout(rejectTimeout);
                    resolve(r);
                } else {
-                    timeOut = setTimeout(() => {
+                    setTimeout(() => {
                        loop();
-                    }, detectInterval);
+                    }, interval);
                }
            });
        }
@ -179,4 +178,12 @@ async function getActiveTab(currentWindow) {
            resolve(tabs[0]);
        })
    })
+}
+
+async function getTabByID(id) {
+    return new Promise((resolve, reject) => {
+        chrome.tabs.get(id, function (tab) {
+            resolve(tab);
+        })
+    })
 }
--- a/scripts/extractor.js
+++ b/scripts/extractor.js
@ -0,0 +1,111 @@
+class Exractor {
+    constructor() {
+        this._tasks = [];
+        this._results = {};
+    }
+    /**
+     * Add a task to Extractor. \n
+     * One Extractor can have multiple tasks, which are organized in a task chain.
+     * Later task will use previous task result as input (target url list).
+     * So only the first task can have target url arguments, while later tasks can't.
+     * @param {...any} args itemsSelector, fieldSelectors, and more args to specify target urls.
+     */
+    task(...args) {
+        if (!testArgs(...args))
+            throw new Error(`Invalid call arguments.\n\n${signitures}\n\n`);
+        // given >2 arguments means the task specifies target page, 
+        // so it won't accept last task result as url list.
+        // in this case, former tasks are useless, can be cleared.
+        if (args.length > 2) this.clear();
+        this._tasks.push(args);
+        return this;
+    }
+    /**
+     * Clear tasks and caches.
+     */
+    clear() {
+        this._tasks = [];
+        this._results = [];
+    }
+    /**
+     * Start the task chain.
+     */
+    async start() {
+        if (!this._tasks.length) {
+            console.log('No task to run.');
+            return;
+        }
+        return this._tasks.reduce((pms, args, i, tasks) => {
+            return pms.then(
+                result => {
+                    if (result === undefined) return getData(...args);
+                    this._results[tasks[i - 1]] = result;
+                    return getData(...args, result);
+                });
+        }, Promise.resolve(undefined)).then(
+            result => {
+                this._results[this._tasks[this._tasks.length - 1]] = result;
+                this.save();
+            }
+        );
+    }
+    /**
+     * restart from specified task, but don't restart the previous tasks.
+     * @param {number} taskid from which restart the tasks
+     */
+    async restart(taskid) {
+        taskid = this._checkTaskId(taskid, 1);
+        if (!taskid) return;
+        if (taskid == 1) {
+            this.start();
+            return;
+        }
+        let cache = this._results[this._tasks[taskid - 2]];
+        if (!cache) {
+            console.log(`No result cache for task (id ${taskid}). \nMake sure call ".start()" before ".restart()"?`);
+            return;
+        }
+        return this._tasks.slice(taskid - 1).reduce((pms, args, i, tasks) => {
+            return pms.then(
+                result => {
+                    this._results[tasks[i - 1]] = result;
+                    return getData(...args, result);
+                });
+        }, Promise.resolve(cache)).then(
+            result => {
+                this._results[this._tasks[this._tasks.length - 1]] = result;
+                this.save();
+            }
+        );
+    }
+    /**
+     * Save result of a task
+     * @param {number} taskid which task id to save.
+     */
+    save(taskid) {
+        taskid = this._checkTaskId(taskid, this._tasks.length);
+        if (!taskid) return;
+        const result = this._results[this._tasks[taskid - 1]];
+        if (!result) {
+            console.log(`No task result for id (${taskid}). Forget to call ".start()"?`);
+            return;
+        }
+        if (confirm(
+                `Click confirm to download if the sample data looks good (${result.data.length} items)：\n\n${result.toString(50) || "- Empty -"}`
+            )) {
+            saveFile(result, "text/csv");
+        }
+    }
+    _checkTaskId(id, defaultId) {
+        if (!this._tasks.length) {
+            console.log("No task found.");
+            return 0;
+        }
+        if (defaultId && id === undefined || this.task === null) id = defaultId;
+        if (isNaN(id) || id < 1 || id > this._tasks.length) {
+            console.log(`Invalid task id. Rang(1-${this._tasks.length})`);
+            return 0;
+        }
+        return id
+    }
+}
--- a/scripts/result.js
+++ b/scripts/result.js
@ -11,6 +11,9 @@ class ExractResult {
            i => this._data[i][index]
        );
    }
+    squash() {
+        return this._data.reduce((p, c) => p.concat(c), []);
+    }
    get data() {
        return this._data;
    }
Author	SHA1	Message	Date
jebbs	78fa48beb0	update documents	2018-09-28 14:14:59 +08:00
jebbs	93dc249e2a	task chain & management	2018-09-28 11:17:05 +08:00
jebbs	43bb837abd	extractTabData report [] if any field element not found	2018-09-27 20:57:42 +08:00
jebbs	24f2c26cbc	remove fail timeout logic	2018-09-27 18:19:06 +08:00
jebbs	df809f6e60	use all values of ExractResult	2018-09-27 16:40:05 +08:00