Compare commits

...

5 Commits

Author SHA1 Message Date
f1cf32b83a availablity check before run on active tab 2020-01-11 20:40:25 +08:00
341abebc66 scrollToBottom option 2020-01-11 20:00:52 +08:00
0cf04c3f79 keep state and continue 2020-01-11 09:02:12 +08:00
6134289d0a queryUrl expected url 2020-01-10 16:35:57 +08:00
0e62d914c1 runtime error do not interrupt promise chains 2020-01-10 16:34:46 +08:00
9 changed files with 324 additions and 181 deletions

View File

@ -22,6 +22,7 @@
"scripts/background/result.js", "scripts/background/result.js",
"scripts/background/signiture.js", "scripts/background/signiture.js",
"scripts/background/actions.js", "scripts/background/actions.js",
"scripts/background/task.js",
"scripts/background/extractor.js", "scripts/background/extractor.js",
"scripts/background/helpers.js" "scripts/background/helpers.js"
], ],

View File

@ -50,17 +50,14 @@ function (itemsSelector:string, fieldSelectors:string[], urls:string[])
function (itemsSelector:string, fieldSelectors:string[], urls:ExtractResult) function (itemsSelector:string, fieldSelectors:string[], urls:ExtractResult)
``` ```
## Advanced Usage ## Stop Tasks
### Stop Tasks The only way to stop tasks before its finish, is `Closing the target tab`.
Tasks wait for their target elements' appearance, given some elements were loaded asynchronously. > Tasks wait for their target elements' appearance, given some elements were loaded asynchronously.
> If you typed wrong selectors, the task waits forever for elements which don't exist.
But if you typed wrong selectors, the task waits forever for elements which don't exists. ## Extract Attributes.
The only way to stop tasks before its finish, is `Closing the host tab`.
### Extract Attributes.
e.g.: link text and target (use 'selector@attribute') e.g.: link text and target (use 'selector@attribute')
@ -68,20 +65,43 @@ e.g.: link text and target (use 'selector@attribute')
new Extractor().task('.item', ['a', 'a@href']).start(); new Extractor().task('.item', ['a', 'a@href']).start();
``` ```
## Advanced Usage
### Use Task Chain. ### Use Task Chain.
e.g.: Collect links from `http://sample.com/abc`, then, Extract data of each link e.g.: Collect links from `http://sample.com/abc`, then, Extract data of each link
```js ```js
new Extractor() e = new Extractor()
.task('.search-list-item', ['a@href'], ["http://sample.com/abc"]) e.task('.search-list-item', ['a@href'], ["http://sample.com/abc"])
.task('list-item', ["a.title", "p.content"]) .task('list-item', ["a.title", "p.content"])
.start(); .start();
``` ```
### Continue Tasks
You can always continue the tasks with the following call, even if the Extractor stopped in the middle of a task:
```js
e.start()
```
The `Extractor` keeps the state of its last execution, and continues from where it stopped.
### Restart Tasks
What should I do if I don't want to continue from the last state, but restart from a certain task instead?
```js
// restart all tasks
e.restart(0)
// restart from 2nd task
e.restart(1)
```
### Save Result of Any Task ### Save Result of Any Task
To a multiple task (chain) Extractor `e`: To a multiple task Extractor `e`:
```js ```js
e = new Extractor() e = new Extractor()
@ -98,37 +118,12 @@ Incase you want to save it again, use:
e.save() e.save()
``` ```
You may want to save another task's result, other than the final: To save another task result, other than the final one:
```js ```js
// save the result of first task // save the result of first task
// to the example above, that is a list of urls // to the example above, that is a list of urls
e.save(0)
// save the result of second task
e.save(1) e.save(1)
``` ```
### Restart Tasks
In cases some later task fails, you don't need to restart all task.
Here we have 2 tasks:
```js
e = new Extractor()
e.task('.search-list-item', ['a@href'], ["http://sample.com/abc"])
.task('list-item', ["a.title", "p.content"])
.start();
```
Suppose the second task fails, we can restart and continue from the task 2:
```js
e.restart(2);
```
If you'd like restart all task, use:
```js
e.start();
// or
e.restart();
```

View File

@ -55,7 +55,7 @@ function parseUrls(...args) {
if (arg instanceof Array) { if (arg instanceof Array) {
return arg; return arg;
} else if (arg instanceof ExtractResult) { } else if (arg instanceof ExtractResult) {
return arg.squash().filter(v => !!v); return arg.squash().filter(v => URL_REG.test(v));
} else { } else {
let urlTempl = arg; let urlTempl = arg;
if (urlTempl) { if (urlTempl) {
@ -86,10 +86,10 @@ function redirectTab(tab, url) {
action: ACTION_GOTO_URL, action: ACTION_GOTO_URL,
url: url url: url
} }
sendMessage(tab, req, `Goto url: ${url}`); return sendMessage(tab, req, `Goto url: ${url}`);
} }
}) })
.then(() => queryUrl(tab, curUrl, 'Check if tab url matches expected...')) .then(() => queryUrl(tab, url, 'Check if tab url matches expected...'))
} }
/** /**
@ -110,32 +110,46 @@ function extractTabData(tab, itemsSelector, fieldSelectors) {
} }
/** /**
* get report in from the target tab, usually used to detect if the content script is ready. * ping target tab, usually used to detect if the content script is ready.
* @param {any} tab target tab * @param {any} tab target tab
* @returns {Promise<string>} a promise of the report in message * @returns {Promise<boolean>} a promise of boolean value indicates if ping success
*/ */
function reportIn(tab) { async function ping(tab, count = 1) {
let req = { let req = {
action: ACTION_REPORT_IN action: ACTION_REPORT_IN
} }
let cond = r => r == req.action; let cond = r => r == req.action;
return sendMessage(tab, req, 'Check tab availability...', cond); let pong = await sendMessage(tab, req, 'Check tab availability...', cond, 1000, count).catch(() => { });
return pong == ACTION_REPORT_IN;
} }
/** /**
* get the url of the target tab * get the url of the target tab
* @param {any} tab target tab * @param {any} tab target tab
* @param {string} urlExcluded if specified, queryUrl resolves only when response not equals to urlExcluded * @param {string} expected if specified, queryUrl resolves only when tab url equals to expected
* @returns {Promise<string>} a promise of the url * @returns {Promise<string>} a promise of the url
*/ */
function queryUrl(tab, urlExcluded, log) { function queryUrl(tab, expected, log) {
let req = { let req = {
action: ACTION_QUERY_URL action: ACTION_QUERY_URL
} }
let cond = url => url && (!urlExcluded || (urlExcluded && urlExcluded != url)); let cond = url => url && (!expected || (expected && expected == url));
return sendMessage(tab, req, log, cond); return sendMessage(tab, req, log, cond);
} }
/**
 * Ask the target tab to scroll its page to the bottom.
 * Sends an ACTION_SCROLL_BOTTOM request through sendMessage.
 * @param {any} tab target tab
 * @returns {Promise} the promise returned by sendMessage for the scroll request
 */
function scrollToBottom(tab) {
    return sendMessage(tab, { action: ACTION_SCROLL_BOTTOM }, 'Scroll to page bottom...');
}
async function createTab(url, active) { async function createTab(url, active) {
return new Promise((resolve, reject) => { return new Promise((resolve, reject) => {
chrome.tabs.create({ chrome.tabs.create({

View File

@ -1,41 +1,45 @@
class Extractor { class Extractor {
constructor() { constructor(options) {
this._tasks = []; this._tasks = [];
this._tab = undefined;
this._running = false; this._running = false;
this._results = {}; this._options = options;
} }
/** /**
* Add a task to Extractor. \n * Add a task to Extractor. \n
* One Extractor could has multiple tasks, which orgnized in a task chian. * One Extractor could has multiple tasks, which orgnized in a task chian.
* Later task will use previous task result as input (target url list). * If url arguments not given within later tasks, they will use previous task result as input (target url list).
* So only the first task can have target url arguments, while later tasks can't.
* @param {...any} args itemsSelector, fieldSelectors, and more args to specify target urls. * @param {...any} args itemsSelector, fieldSelectors, and more args to specify target urls.
*/ */
task(...args) { task(...args) {
if (!testArgs(...args)) { this._tasks.push(new Task(this._options, ...args));
console.log(`Invalid task arguments: ${argsToString(...args)}\n\n${signitures}\n`);
return this;
}
// given >2 arguments means the task specifies target page,
// so it won't accept last task result as url list.
// in this case, former tasks are useless, can be cleared.
if (args.length > 2) this.clear();
this._tasks.push(args);
return this; return this;
} }
/** /**
* Clear tasks and caches. * Clear tasks and task caches.
*/ */
clear() { clear() {
this._tasks = []; this._tasks = [];
this._results = [];
return this; return this;
} }
/** /**
* Start the task chain. * Start the task chain.
*/ */
async start() { async start() {
return this._startTasks(0);
}
/**
* restart from specified task, but don't restart the previous tasks.
* @param {number} from where to restart the tasks, begins with 0
*/
async restart(from = 0) {
let id = this._checkTaskId(from, 0);
if (id < 0) return;
for (let i = id; i < this._tasks.length; i++) {
this._tasks[i].clean();
}
return this._startTasks(0);
}
async _startTasks(from) {
if (this._running) { if (this._running) {
console.log('The Extractor is running. Please wait..'); console.log('The Extractor is running. Please wait..');
return; return;
@ -44,107 +48,75 @@ class Extractor {
console.log('No task to run.'); console.log('No task to run.');
return; return;
} }
let firstTaskArgs = this._tasks[0];
if (firstTaskArgs.length > 2) { let tab;
let task = this._tasks[0];
if (task.urls.length) {
// task specifies target urls, create new tab with first url for it // task specifies target urls, create new tab with first url for it
let urls = parseUrls(...firstTaskArgs.slice(2, firstTaskArgs.length)); tab = await createTab(task.urls[0], false);
this._tab = await createTab(urls[0], false);
} else { } else {
this._tab = await getActiveTab(false); tab = await getActiveTab(true) || await getActiveTab(false);
let succ = await ping(tab);
if (!succ) {
console.log('Cannot contact with active tab.');
return;
}
} }
this._running = true; this._running = true;
return this._tasks.reduce((pms, args, i, tasks) => { return this._tasks.reduce((pms, task, i) => {
return pms.then( return pms.then(
result => { () => {
if (result === undefined) return getData(this._tab, ...args); if (i < from) return;
this._results[tasks[i - 1]] = result; if (i > 0) {
return getData(this._tab, ...args, result); let prevTask = this._tasks[i - 1];
return task.execute(tab, new ExtractResult(prevTask.results));
}
return task.execute(tab, undefined);
}); });
}, Promise.resolve(undefined)).then( }, Promise.resolve(undefined)).then(
result => { () => {
this._results[this._tasks[this._tasks.length - 1]] = result;
this._running = false;
console.log("Tasks are all done.")
this.save();
}
).catch(err => {
this._running = false;
console.log(err)
});
}
/**
* restart from specified task, but don't restart the previous tasks.
* @param {number} taskid from which restart the tasks
*/
async restart(taskid) {
if (this._running) {
console.log('The Extractor is running. Please wait..');
return;
}
taskid = this._checkTaskId(taskid, 1);
if (!taskid) return;
if (taskid == 1) {
this.start();
return;
}
let cache = this._results[this._tasks[taskid - 2]];
if (!cache) {
console.log(`No result cache for task (id ${taskid}). \nMake sure call ".start()" before ".restart()"?`);
return;
}
this._running = true;
this._tab = await createTab(parseUrls(cache)[0], false)
return this._tasks.slice(taskid - 1).reduce((pms, args, i, tasks) => {
return pms.then(
result => {
this._results[tasks[i - 1]] = result;
return getData(this._tab, ...args, result);
});
}, Promise.resolve(cache)).then(
result => {
this._results[this._tasks[this._tasks.length - 1]] = result;
this._running = false; this._running = false;
this.save(); this.save();
} }
).catch(err => { ).catch(err => {
this._running = false; this._running = false;
console.log(err) console.log(err);
}); });
} }
/** /**
* Save result of a task * Save result of a task
* @param {number} taskid which task id to save. * @param {number} taskid which task id to save, begins with 0
*/ */
save(taskid) { save(taskid) {
taskid = this._checkTaskId(taskid, this._tasks.length); let id = this._checkTaskId(taskid, this._tasks.length - 1);
if (!taskid) return; if (id < 0) return;
const result = this._results[this._tasks[taskid - 1]]; let results = this._tasks[id].results
if (!result) { results.unshift(this._tasks[id].fieldSelectors);
console.log(`No result for task #${taskid}. Forget to call ".start()"?`);
return; let exResults = new ExtractResult(results);
}
if (result.data.length <= 1) { // 1 for selector headers if (!results.length) {
console.log(`No result for task #${taskid}. Forget to call ".start()"?`); console.log(`No result for task #${id}. Forget to call ".start()"?`);
return; return;
} }
let msg = ` let msg = `
Please confirm to download (${result.data.length - 1} items) Please confirm to download (${results.length - 1} items)
${result.toString(50) || "- Empty -"} ${exResults.toString(50) || "- Empty -"}
`.trim(); `.trim();
if (confirm(msg)) { if (confirm(msg)) {
saveFile(result, "text/csv"); saveFile(exResults, "text/csv");
} }
} }
_checkTaskId(id, defaultId) { _checkTaskId(id, defaultId) {
if (!this._tasks.length) { if (!this._tasks.length) {
console.log("No task found."); console.log("No task found.");
return 0; return -1;
} }
if (defaultId && id === undefined || this.task === null) id = defaultId; if (!isNaN(defaultId) && id === undefined) id = defaultId;
if (isNaN(id) || id < 1 || id > this._tasks.length) { if (isNaN(id) || id < 0 || id >= this._tasks.length) {
console.log(`Invalid task id. Rang(1-${this._tasks.length})`); console.log(`Invalid task id. Rang(0-${this._tasks.length - 1})`);
return 0; return -1;
} }
return id return id
} }

View File

@ -4,12 +4,15 @@
* @param {object} tab the table where to send the message * @param {object} tab the table where to send the message
* @param {object} req the request data. * @param {object} req the request data.
* @param {function} cond success condition function, r:any=>boolean * @param {function} cond success condition function, r:any=>boolean
* @param {number} interval interval for detecting * @param {number} interval retry interval, default: 500ms.
* @param {number} limit retry limit, default: 0, no limit.
* @param {string} log messages logged to console. * @param {string} log messages logged to console.
* @return {Promise} a promise of the response. * @return {Promise} a promise of the response.
*/ */
function sendMessage(tab, req, log, cond, interval) { function sendMessage(tab, req, log, cond, interval, limit = 0) {
interval = interval || 500; interval = interval || 500;
limit = limit && !isNaN(limit) ? limit : 0;
count = 0;
return new Promise((resolve, reject) => { return new Promise((resolve, reject) => {
loop(); loop();
@ -22,11 +25,17 @@ function sendMessage(tab, req, log, cond, interval) {
return; return;
} }
chrome.tabs.sendMessage(tab.id, req, r => { if (limit && count >= limit) {
if (chrome.runtime.lastError) { reject(`sendMessage loop limit ${limit} reached.`);
reject(chrome.runtime.lastError.message);
return; return;
} }
count++;
chrome.tabs.sendMessage(tab.id, req, r => {
// check error but do nothing.
// do not interrupt promise chains even if error, or the task always fail when:
// a tab is newly created, and the content scripts won't have time to initialize
chrome.runtime.lastError;
let flag = !cond || cond(r); let flag = !cond || cond(r);
if (log) console.log(log, flag ? '(OK)' : '(failed)'); if (log) console.log(log, flag ? '(OK)' : '(failed)');
if (flag) { if (flag) {

View File

@ -0,0 +1,78 @@
/**
 * A single extraction task: a set of selectors plus the urls to apply them to.
 * Extracted results are cached per url, so a task that is executed again only
 * visits the urls it has no result for yet.
 */
class Task {
    // extracted results, keyed by the url they were extracted from
    _data = {};
    // urls in the order their results were saved; defines the order of `results`
    _data_keys = [];

    /**
     * Create a task. `args` accepts one of:
     * (itemsSelector:string, fieldSelectors:string[])
     * (itemsSelector:string, fieldSelectors:string[], url:string, from:number, to:number, interval:number)
     * (itemsSelector:string, fieldSelectors:string[], url:string, pages:number[])
     * (itemsSelector:string, fieldSelectors:string[], urls:string[])
     * @param {object} [options] task options; `scrollToBottom` is the only one read here
     * @param {...any} args
     * @throws {Error} if the arguments do not pass testArgs
     */
    constructor(options, ...args) {
        if (!testArgs(...args)) {
            throw new Error(`Invalid call arguments.\n\n${signitures}\n\n`);
        }
        // options are optional (e.g. `new Extractor()`), never leave them undefined
        this._options = options || {};
        this._itemsSelector = args.shift();
        this._fieldSelectors = args.shift();
        this._urls = parseUrls(...args);
    }

    /** Target urls given to the constructor (empty if none were given). */
    get urls() {
        return this._urls;
    }

    /** Raw result cache, keyed by url. */
    get data() {
        return this._data;
    }

    /** Results of all urls, concatenated in the order they were saved. */
    get results() {
        return this._data_keys.reduce((p, c) => {
            return p.concat(this._data[c]);
        }, []);
    }

    get fieldSelectors() {
        return this._fieldSelectors;
    }

    /**
     * Drop all cached results.
     * The key list is reset together with the data, otherwise `results` would
     * yield `undefined` entries and re-executed urls would be listed twice.
     */
    clean() {
        this._data = {};
        this._data_keys = [];
    }

    /**
     * Execute the task on the given tab, url by url, in sequence.
     * Urls are taken from the constructor arguments; if none were given, from
     * `upstreamData`; if that is missing too, the current url of the tab is used.
     * Urls which already have a cached result are skipped.
     * @param {any} tab target tab
     * @param {any} upstreamData result of the previous task, parsed by parseUrls
     * @returns {Promise<undefined>} resolves when all urls are processed;
     * rejects if no tab is given or any step fails
     */
    async execute(tab, upstreamData) {
        if (!tab) return Promise.reject("No tab to execute the task.");
        let urls = this._urls;
        if (!urls.length) {
            urls = upstreamData ? parseUrls(upstreamData) : [await queryUrl(tab)];
        }
        for (const url of urls) {
            // keep state and continue: never re-visit a url we have a result for
            if (this._data[url]) continue;
            await redirectTab(tab, url);
            if (this._options["scrollToBottom"]) {
                await scrollToBottom(tab);
            }
            const results = await extractTabData(tab, this._itemsSelector, this._fieldSelectors);
            this._saveResult(url, results);
        }
    }

    // Cache the results of one url, registering its key only once.
    _saveResult(key, results) {
        this._data[key] = results;
        if (!this._data_keys.includes(key)) this._data_keys.push(key);
    }
}

View File

@ -1,27 +1,38 @@
chrome.runtime.onMessage.addListener( (function () {
chrome.runtime.onMessage.addListener(
function (request, sender, sendResponse) { function (request, sender, sendResponse) {
if (!request.action) return; if (!request.action) return;
// console.log("Recieved request:",request); // console.log("Recieved request:",request);
doAction(request, sender).then(r => sendResponse && sendResponse(r));
// return true to indicate you wish to send a response asynchronously
return true;
}
);
async function doAction(request, sender) {
switch (request.action) { switch (request.action) {
case ACTION_EXTRACT: case ACTION_EXTRACT:
let data = extract(request.itemsSelector, request.fieldSelectors); let data = extract(request.itemsSelector, request.fieldSelectors);
if (sendResponse) sendResponse(data); return data;
break;
case ACTION_GOTO_URL: case ACTION_GOTO_URL:
window.location.replace(request.url); window.location.replace(request.url);
if (sendResponse) sendResponse(request.url); return request.url;
break;
case ACTION_REPORT_IN: case ACTION_REPORT_IN:
if (sendResponse) sendResponse(request.action); return request.action;
break;
case ACTION_QUERY_URL: case ACTION_QUERY_URL:
if (sendResponse) sendResponse(window.location.href); return window.location.href;
break; case ACTION_SCROLL_BOTTOM:
return executeUntil(
() => window.scrollTo(0, document.body.clientHeight),
() => document.body.clientHeight - window.scrollY - window.innerHeight < 20,
"Scroll to page bottom...",
1000,
10
)
default: default:
break; break;
} }
} }
);
function extract(itemsSelector, fieldSelectors) { function extract(itemsSelector, fieldSelectors) {
// since some elements may be loaded asynchronously. // since some elements may be loaded asynchronously.
@ -51,3 +62,40 @@ function extract(itemsSelector, fieldSelectors) {
let shouldWait = fieldSelectors.reduce((p, c) => p || !fieldFound[c], false); let shouldWait = fieldSelectors.reduce((p, c) => p || !fieldFound[c], false);
return shouldWait ? MSG_ELEMENT_NOT_FOUND : results return shouldWait ? MSG_ELEMENT_NOT_FOUND : results
} }
/**
 * Repeatedly execute a function until the detector returns true.
 * Each round calls `fn`, waits `interval` milliseconds, then asks the detector.
 * @param {function} fn the function to execute
 * @param {function} detector the detector; if omitted, the first round succeeds
 * @param {string} log messages logged to console.
 * @param {number} interval interval for detecting, default: 500ms
 * @param {number} limit max execute times of the function; 0 or omitted means no limit
 * @return {Promise} resolves with true once the detector succeeds;
 * rejects with false when the limit is reached, or with the error thrown by
 * `fn` / `detector`
 */
function executeUntil(fn, detector, log, interval, limit) {
    const delay = interval || 500;
    let count = 0;
    return new Promise((resolve, reject) => {
        loop();

        function loop() {
            try {
                fn();
            } catch (err) {
                reject(err);
                return;
            }
            // count executions (not the limit), so the limit can actually be reached
            count++;
            setTimeout(() => {
                let flag;
                try {
                    flag = !detector || detector();
                } catch (err) {
                    reject(err);
                    return;
                }
                if (log) console.log(log, flag ? '(OK)' : '(failed)');
                if (flag) {
                    resolve(true);
                    return;
                }
                if (limit && count >= limit) {
                    // stop here: no further rounds are scheduled after giving up
                    reject(false);
                    return;
                }
                loop();
            }, delay);
        }
    });
}
})();

View File

@ -1,8 +1,12 @@
const EXT_NAME = "DataExtracter"; const EXT_NAME = "DataExtracter";
const URL_REG = getWebUrl();
const ACTION_EXTRACT = `${EXT_NAME}:Extract`; const ACTION_EXTRACT = `${EXT_NAME}:Extract`;
const ACTION_GOTO_URL = `${EXT_NAME}:GoToTUL`; const ACTION_GOTO_URL = `${EXT_NAME}:GoToTUL`;
const ACTION_REPORT_IN = `${EXT_NAME}:ReportIn`; const ACTION_REPORT_IN = `${EXT_NAME}:ReportIn`;
const ACTION_QUERY_URL = `${EXT_NAME}:QueryURL`; const ACTION_QUERY_URL = `${EXT_NAME}:QueryURL`;
const ACTION_SCROLL_BOTTOM = `${EXT_NAME}:ScrollToBottom`;
const MSG_ELEMENT_NOT_FOUND = new ConstMessage(1, "No element found for at least one selector, maybe it's not loaded yet"); const MSG_ELEMENT_NOT_FOUND = new ConstMessage(1, "No element found for at least one selector, maybe it's not loaded yet");
const MSG_URL_SKIPPED = new ConstMessage(100, "Skipped current URL");

File diff suppressed because one or more lines are too long