Compare commits

...

5 Commits

Author SHA1 Message Date
f1cf32b83a availablity check before run on active tab 2020-01-11 20:40:25 +08:00
341abebc66 scrollToBottom option 2020-01-11 20:00:52 +08:00
0cf04c3f79 keep state and continue 2020-01-11 09:02:12 +08:00
6134289d0a queryUrl expected url 2020-01-10 16:35:57 +08:00
0e62d914c1 runtime error do not interrupt promise chains 2020-01-10 16:34:46 +08:00
9 changed files with 324 additions and 181 deletions

View File

@ -22,6 +22,7 @@
"scripts/background/result.js",
"scripts/background/signiture.js",
"scripts/background/actions.js",
"scripts/background/task.js",
"scripts/background/extractor.js",
"scripts/background/helpers.js"
],

View File

@ -50,17 +50,14 @@ function (itemsSelector:string, fieldSelectors:string[], urls:string[])
function (itemsSelector:string, fieldSelectors:string[], urls:ExtractResult)
```
## Advanced Usage
## Stop Tasks
### Stop Tasks
The only way to stop tasks before its finish, is `Closing the target tab`.
Tasks wait for their target elements' appearance, given some elements were loaded asynchronously.
> Tasks wait for their target elements' appearance, given some elements were loaded asynchronously.
> If you type a wrong selector, the task waits forever for elements that don't exist.
But if you typed wrong selectors, the task waits forever for elements which don't exists.
The only way to stop tasks before they finish is `Closing the host tab`.
### Extract Attributes.
## Extract Attributes.
e.g.: link text and target (use 'selector@attribute')
@ -68,20 +65,43 @@ e.g.: link text and target (use 'selector@attribute')
new Extractor().task('.item', ['a', 'a@href']).start();
```
## Advanced Usage
### Use Task Chain.
e.g.: Collect links from `http://sample.com/abc`, then, Extract data of each link
```js
new Extractor()
.task('.search-list-item', ['a@href'], ["http://sample.com/abc"])
e = new Extractor()
e.task('.search-list-item', ['a@href'], ["http://sample.com/abc"])
.task('list-item', ["a.title", "p.content"])
.start();
```
### Continue Tasks
You can always continue tasks (with the following call), even if they stopped in the middle of a task:
```js
e.start()
```
The `Extractor` kept the state of last execution, and starts from where it stopped.
### Restart Tasks
What should I do if I don't want to continue from the last state, but restart from a certain task instead?
```js
// restart all tasks
e.restart(0)
// restart from 2nd task
e.restart(1)
```
### Save Result of Any Task
To a multiple task (chain) Extractor `e`:
For a multi-task Extractor `e`:
```js
e = new Extractor()
@ -98,37 +118,12 @@ Incase you want to save it again, use:
e.save()
```
You may want to save another task's result, other than the final:
To save another task result, other than the final one:
```js
// save the result of first task
// to the example above, that is a list of urls
e.save(0)
// save the result of second task
e.save(1)
```
### Restart Tasks
In cases some later task fails, you don't need to restart all task.
Here we have 2 tasks:
```js
e = new Extractor()
e.task('.search-list-item', ['a@href'], ["http://sample.com/abc"])
.task('list-item', ["a.title", "p.content"])
.start();
```
Suppose the second task fails, we can restart and continue from the task 2:
```js
e.restart(2);
```
If you'd like restart all task, use:
```js
e.start();
// or
e.restart();
```

View File

@ -55,7 +55,7 @@ function parseUrls(...args) {
if (arg instanceof Array) {
return arg;
} else if (arg instanceof ExtractResult) {
return arg.squash().filter(v => !!v);
return arg.squash().filter(v => URL_REG.test(v));
} else {
let urlTempl = arg;
if (urlTempl) {
@ -86,10 +86,10 @@ function redirectTab(tab, url) {
action: ACTION_GOTO_URL,
url: url
}
sendMessage(tab, req, `Goto url: ${url}`);
return sendMessage(tab, req, `Goto url: ${url}`);
}
})
.then(() => queryUrl(tab, curUrl, 'Check if tab url matches expected...'))
.then(() => queryUrl(tab, url, 'Check if tab url matches expected...'))
}
/**
@ -110,32 +110,46 @@ function extractTabData(tab, itemsSelector, fieldSelectors) {
}
/**
* get report in from the target tab, usually used to detect if the content script is ready.
* ping target tab, usually used to detect if the content script is ready.
* @param {any} tab target tab
* @returns {Promise<string>} a promise of the report in message
* @returns {Promise<boolean>} a promise of boolean value indicates if ping success
*/
function reportIn(tab) {
async function ping(tab, count = 1) {
let req = {
action: ACTION_REPORT_IN
}
let cond = r => r == req.action;
return sendMessage(tab, req, 'Check tab availability...', cond);
let pong = await sendMessage(tab, req, 'Check tab availability...', cond, 1000, count).catch(() => { });
return pong == ACTION_REPORT_IN;
}
/**
* get the url of the target tab
* @param {any} tab target tab
* @param {string} urlExcluded if specified, queryUrl resolves only when response not equals to urlExcluded
* @param {string} expected if specified, queryUrl resolves only when tab url equals to expected
* @returns {Promise<string>} a promise of the url
*/
function queryUrl(tab, urlExcluded, log) {
function queryUrl(tab, expected, log) {
let req = {
action: ACTION_QUERY_URL
}
let cond = url => url && (!urlExcluded || (urlExcluded && urlExcluded != url));
let cond = url => url && (!expected || (expected && expected == url));
return sendMessage(tab, req, log, cond);
}
/**
 * Ask the target tab to scroll its page to the bottom.
 * Sends an ACTION_SCROLL_BOTTOM request through sendMessage without a success condition.
 * @param {any} tab target tab
 * @returns {Promise} the promise returned by sendMessage for the scroll request
 */
function scrollToBottom(tab) {
    return sendMessage(tab, { action: ACTION_SCROLL_BOTTOM }, 'Scroll to page bottom...');
}
async function createTab(url, active) {
return new Promise((resolve, reject) => {
chrome.tabs.create({

View File

@ -1,41 +1,45 @@
class Extractor {
constructor() {
constructor(options) {
this._tasks = [];
this._tab = undefined;
this._running = false;
this._results = {};
this._options = options;
}
/**
* Add a task to Extractor. \n
* One Extractor can have multiple tasks, which are organized in a task chain.
* Later task will use previous task result as input (target url list).
* So only the first task can have target url arguments, while later tasks can't.
* If url arguments not given within later tasks, they will use previous task result as input (target url list).
* @param {...any} args itemsSelector, fieldSelectors, and more args to specify target urls.
*/
task(...args) {
if (!testArgs(...args)) {
console.log(`Invalid task arguments: ${argsToString(...args)}\n\n${signitures}\n`);
return this;
}
// given >2 arguments means the task specifies target page,
// so it won't accept last task result as url list.
// in this case, former tasks are useless, can be cleared.
if (args.length > 2) this.clear();
this._tasks.push(args);
this._tasks.push(new Task(this._options, ...args));
return this;
}
/**
* Clear tasks and caches.
* Clear tasks and task caches.
*/
clear() {
this._tasks = [];
this._results = [];
return this;
}
/**
* Start the task chain.
*/
async start() {
return this._startTasks(0);
}
/**
* restart from specified task, but don't restart the previous tasks.
* @param {number} from where to restart the tasks, begins with 0
*/
async restart(from = 0) {
let id = this._checkTaskId(from, 0);
if (id < 0) return;
for (let i = id; i < this._tasks.length; i++) {
this._tasks[i].clean();
}
return this._startTasks(0);
}
async _startTasks(from) {
if (this._running) {
console.log('The Extractor is running. Please wait..');
return;
@ -44,107 +48,75 @@ class Extractor {
console.log('No task to run.');
return;
}
let firstTaskArgs = this._tasks[0];
if (firstTaskArgs.length > 2) {
let tab;
let task = this._tasks[0];
if (task.urls.length) {
// task specifies target urls, create new tab with first url for it
let urls = parseUrls(...firstTaskArgs.slice(2, firstTaskArgs.length));
this._tab = await createTab(urls[0], false);
tab = await createTab(task.urls[0], false);
} else {
this._tab = await getActiveTab(false);
tab = await getActiveTab(true) || await getActiveTab(false);
let succ = await ping(tab);
if (!succ) {
console.log('Cannot contact with active tab.');
return;
}
}
this._running = true;
return this._tasks.reduce((pms, args, i, tasks) => {
return this._tasks.reduce((pms, task, i) => {
return pms.then(
result => {
if (result === undefined) return getData(this._tab, ...args);
this._results[tasks[i - 1]] = result;
return getData(this._tab, ...args, result);
() => {
if (i < from) return;
if (i > 0) {
let prevTask = this._tasks[i - 1];
return task.execute(tab, new ExtractResult(prevTask.results));
}
return task.execute(tab, undefined);
});
}, Promise.resolve(undefined)).then(
result => {
this._results[this._tasks[this._tasks.length - 1]] = result;
this._running = false;
console.log("Tasks are all done.")
this.save();
}
).catch(err => {
this._running = false;
console.log(err)
});
}
/**
* restart from specified task, but don't restart the previous tasks.
* @param {number} taskid from which restart the tasks
*/
async restart(taskid) {
if (this._running) {
console.log('The Extractor is running. Please wait..');
return;
}
taskid = this._checkTaskId(taskid, 1);
if (!taskid) return;
if (taskid == 1) {
this.start();
return;
}
let cache = this._results[this._tasks[taskid - 2]];
if (!cache) {
console.log(`No result cache for task (id ${taskid}). \nMake sure call ".start()" before ".restart()"?`);
return;
}
this._running = true;
this._tab = await createTab(parseUrls(cache)[0], false)
return this._tasks.slice(taskid - 1).reduce((pms, args, i, tasks) => {
return pms.then(
result => {
this._results[tasks[i - 1]] = result;
return getData(this._tab, ...args, result);
});
}, Promise.resolve(cache)).then(
result => {
this._results[this._tasks[this._tasks.length - 1]] = result;
() => {
this._running = false;
this.save();
}
).catch(err => {
this._running = false;
console.log(err)
console.log(err);
});
}
/**
* Save result of a task
* @param {number} taskid which task id to save.
* @param {number} taskid which task id to save, begins with 0
*/
save(taskid) {
taskid = this._checkTaskId(taskid, this._tasks.length);
if (!taskid) return;
const result = this._results[this._tasks[taskid - 1]];
if (!result) {
console.log(`No result for task #${taskid}. Forget to call ".start()"?`);
return;
}
if (result.data.length <= 1) { // 1 for selector headers
console.log(`No result for task #${taskid}. Forget to call ".start()"?`);
let id = this._checkTaskId(taskid, this._tasks.length - 1);
if (id < 0) return;
let results = this._tasks[id].results
results.unshift(this._tasks[id].fieldSelectors);
let exResults = new ExtractResult(results);
if (!results.length) {
console.log(`No result for task #${id}. Forget to call ".start()"?`);
return;
}
let msg = `
Please confirm to download (${result.data.length - 1} items)
Please confirm to download (${results.length - 1} items)
${result.toString(50) || "- Empty -"}
${exResults.toString(50) || "- Empty -"}
`.trim();
if (confirm(msg)) {
saveFile(result, "text/csv");
saveFile(exResults, "text/csv");
}
}
_checkTaskId(id, defaultId) {
if (!this._tasks.length) {
console.log("No task found.");
return 0;
return -1;
}
if (defaultId && id === undefined || this.task === null) id = defaultId;
if (isNaN(id) || id < 1 || id > this._tasks.length) {
console.log(`Invalid task id. Rang(1-${this._tasks.length})`);
return 0;
if (!isNaN(defaultId) && id === undefined) id = defaultId;
if (isNaN(id) || id < 0 || id >= this._tasks.length) {
console.log(`Invalid task id. Rang(0-${this._tasks.length - 1})`);
return -1;
}
return id
}

View File

@ -4,12 +4,15 @@
* @param {object} tab the tab to which the message is sent
* @param {object} req the request data.
* @param {function} cond success condition function, r:any=>boolean
* @param {number} interval interval for detecting
* @param {number} interval retry interval, default: 500ms.
* @param {number} limit retry limit, default: 0, no limit.
* @param {string} log messages logged to console.
* @return {Promise} a promise of the response.
*/
function sendMessage(tab, req, log, cond, interval) {
function sendMessage(tab, req, log, cond, interval, limit = 0) {
interval = interval || 500;
limit = limit && !isNaN(limit) ? limit : 0;
count = 0;
return new Promise((resolve, reject) => {
loop();
@ -22,11 +25,17 @@ function sendMessage(tab, req, log, cond, interval) {
return;
}
if (limit && count >= limit) {
reject(`sendMessage loop limit ${limit} reached.`);
return;
}
count++;
chrome.tabs.sendMessage(tab.id, req, r => {
if (chrome.runtime.lastError) {
reject(chrome.runtime.lastError.message);
return;
}
// check error but do nothing.
// do not interrupt promise chains even if error, or the task always fail when:
// a tab is newly created, and the content scripts won't have time to initialize
chrome.runtime.lastError;
let flag = !cond || cond(r);
if (log) console.log(log, flag ? '(OK)' : '(failed)');
if (flag) {

View File

@ -0,0 +1,78 @@
/**
 * A single extraction task: which items/fields to extract, and from which urls.
 * Results are cached per url, so a task that was interrupted can be executed
 * again and only the urls without a cached result are visited.
 */
class Task {
    // extracted results, keyed by the url they were extracted from
    _data = {};
    // urls in the order their results were first saved
    _data_keys = [];
    /**
     * Create a task.
     * constructor(options, itemsSelector:string, fieldSelectors:string[])
     * constructor(options, itemsSelector:string, fieldSelectors:string[], url:string, from:number, to:number, interval:number)
     * constructor(options, itemsSelector:string, fieldSelectors:string[], url:string, pages:number[])
     * constructor(options, itemsSelector:string, fieldSelectors:string[], urls:string[])
     * @param {object} [options] extractor options; only `scrollToBottom` is read by the task
     * @param {...any} args itemsSelector, fieldSelectors, and optional url arguments
     * @throws {Error} if testArgs does not accept the arguments
     */
    constructor(options, ...args) {
        if (!testArgs(...args))
            throw new Error(`Invalid call arguments.\n\n${signitures}\n\n`);
        // the Extractor may be created without options; never keep undefined here,
        // otherwise reading an option in execute() throws a TypeError.
        this._options = options || {};
        this._itemsSelector = args.shift();
        this._fieldSelectors = args.shift();
        this._urls = parseUrls(...args);
    }
    /** Target urls given to the constructor (empty if none were specified). */
    get urls() {
        return this._urls;
    }
    /** Cached results keyed by url. */
    get data() {
        return this._data;
    }
    /** All cached results concatenated, in the order their urls were first saved. */
    get results() {
        return this._data_keys.reduce((p, c) => {
            return p.concat(this._data[c]);
        }, []);
    }
    get fieldSelectors() {
        return this._fieldSelectors;
    }
    /**
     * Drop all cached results, so the next execute() visits every url again.
     */
    clean() {
        this._data = {};
        // keys must be reset together with the data, or `results` would
        // enumerate stale urls and yield undefined entries.
        this._data_keys = [];
    }
    /**
     * Execute the task in the given tab, one url after another.
     * Urls come from the constructor arguments; if none were given, from
     * upstreamData; if that is missing too, the current url of the tab is used.
     * Urls that already have a cached result are skipped.
     * @param {any} tab the tab to run the task in
     * @param {any} upstreamData result of the previous task, passed to parseUrls
     * @returns {Promise<void>} resolves when all urls are processed
     */
    async execute(tab, upstreamData) {
        if (!tab) return Promise.reject("No tab to execute the task.");
        let urls = this._urls;
        if (!urls.length) {
            urls = upstreamData ? parseUrls(upstreamData) : [await queryUrl(tab)];
        }
        for (const url of urls) {
            // keep state and continue: this url was extracted by a former run
            if (this._data[url]) continue;
            await redirectTab(tab, url);
            if (this._options.scrollToBottom) {
                await scrollToBottom(tab);
            }
            const results = await extractTabData(tab, this._itemsSelector, this._fieldSelectors);
            this._saveResult(url, results);
        }
    }
    /**
     * Cache the results of a url, registering the url only once.
     */
    _saveResult(key, results) {
        if (!this._data_keys.includes(key)) this._data_keys.push(key);
        this._data[key] = results;
    }
}

View File

@ -1,53 +1,101 @@
chrome.runtime.onMessage.addListener(
function (request, sender, sendResponse) {
if (!request.action) return;
// console.log("Recieved request:",request);
(function () {
chrome.runtime.onMessage.addListener(
function (request, sender, sendResponse) {
if (!request.action) return;
// console.log("Recieved request:",request);
doAction(request, sender).then(r => sendResponse && sendResponse(r));
// return true to indicate you wish to send a response asynchronously
return true;
}
);
async function doAction(request, sender) {
switch (request.action) {
case ACTION_EXTRACT:
let data = extract(request.itemsSelector, request.fieldSelectors);
if (sendResponse) sendResponse(data);
break;
return data;
case ACTION_GOTO_URL:
window.location.replace(request.url);
if (sendResponse) sendResponse(request.url);
break;
return request.url;
case ACTION_REPORT_IN:
if (sendResponse) sendResponse(request.action);
break;
return request.action;
case ACTION_QUERY_URL:
if (sendResponse) sendResponse(window.location.href);
break;
return window.location.href;
case ACTION_SCROLL_BOTTOM:
return executeUntil(
() => window.scrollTo(0, document.body.clientHeight),
() => document.body.clientHeight - window.scrollY - window.innerHeight < 20,
"Scroll to page bottom...",
1000,
10
)
default:
break;
}
}
);
function extract(itemsSelector, fieldSelectors) {
// since some elements may be loaded asynchronously.
// if one field is never found, we should return undefined,
// so that senders can detect to retry until elements loaded.
// If user writes wrong selectors, the task retries infinitely.
let fieldFound = {};
let items = Array.from(document.querySelectorAll(itemsSelector));
// items may not loaded yet, tell the sender to retry.
if (!items.length) return MSG_ELEMENT_NOT_FOUND;
let results = items.map(
item => {
return fieldSelectors.map(
selector => {
let [cls, attr] = selector.split('@').slice(0, 2);
let fieldVals = Array.from(item.querySelectorAll(cls));
if (!fieldVals.length) {
return;
function extract(itemsSelector, fieldSelectors) {
// since some elements may be loaded asynchronously.
// if one field is never found, we should return undefined,
// so that senders can detect to retry until elements loaded.
// If user writes wrong selectors, the task retries infinitely.
let fieldFound = {};
let items = Array.from(document.querySelectorAll(itemsSelector));
// items may not loaded yet, tell the sender to retry.
if (!items.length) return MSG_ELEMENT_NOT_FOUND;
let results = items.map(
item => {
return fieldSelectors.map(
selector => {
let [cls, attr] = selector.split('@').slice(0, 2);
let fieldVals = Array.from(item.querySelectorAll(cls));
if (!fieldVals.length) {
return;
}
fieldFound[selector] = true;
return fieldVals.map(find => attr ? find[attr] : find.textContent.trim()).join('\n')
}
fieldFound[selector] = true;
return fieldVals.map(find => attr ? find[attr] : find.textContent.trim()).join('\n')
)
}
);
// if it exists a field, which is not found in any row, the sender should retry.
let shouldWait = fieldSelectors.reduce((p, c) => p || !fieldFound[c], false);
return shouldWait ? MSG_ELEMENT_NOT_FOUND : results
}
/**
 * Repeatedly execute a function until the detector returns true.
 * After each execution the detector is checked once `interval` ms have passed.
 * @param {function} fn the function to execute
 * @param {function} detector the detector; if omitted, the first check succeeds
 * @param {string} log messages logged to console.
 * @param {number} interval interval for detecting, default: 500ms
 * @param {number} limit max execute times of the function; falsy means no limit
 * @return {Promise} resolves with true when the detector succeeds; rejects with
 * false when the limit is reached, or with the error thrown by fn/detector.
 */
function executeUntil(fn, detector, log, interval, limit) {
    interval = interval || 500;
    let count = 0;
    return new Promise((resolve, reject) => {
        attempt();

        function attempt() {
            try {
                fn();
            } catch (err) {
                // an exception inside a timer callback would otherwise be lost
                reject(err);
                return;
            }
            // count executions (the limit itself must never be modified)
            count++;
            setTimeout(() => {
                let flag;
                try {
                    flag = !detector || detector();
                } catch (err) {
                    reject(err);
                    return;
                }
                if (log) console.log(log, flag ? '(OK)' : '(failed)');
                if (flag) {
                    resolve(true);
                    return;
                }
                // give up only after the last allowed execution has been checked
                if (limit && count >= limit) {
                    reject(false);
                    return;
                }
                attempt();
            }, interval);
        }
    });
}
})();

View File

@ -1,8 +1,12 @@
// Extension name; used as the prefix of every action identifier below.
const EXT_NAME = "DataExtracter";
// Pattern tested against candidate strings (via URL_REG.test) to keep only web urls.
// NOTE(review): getWebUrl is defined elsewhere; presumably it returns a RegExp — confirm.
const URL_REG = getWebUrl();
// Action identifiers carried in `request.action` of messages sent to a tab.
const ACTION_EXTRACT = `${EXT_NAME}:Extract`;
// NOTE(review): "GoToTUL" looks like a typo of "GoToURL"; it only works as long as
// every sender and receiver uses this constant rather than the literal string.
const ACTION_GOTO_URL = `${EXT_NAME}:GoToTUL`;
const ACTION_REPORT_IN = `${EXT_NAME}:ReportIn`;
const ACTION_QUERY_URL = `${EXT_NAME}:QueryURL`;
const ACTION_SCROLL_BOTTOM = `${EXT_NAME}:ScrollToBottom`;
// Returned by the extraction code when items or fields are not found (yet),
// which tells the sender to retry.
const MSG_ELEMENT_NOT_FOUND = new ConstMessage(1, "No element found for at least one selector, maybe it's not loaded yet");
// Marker used by a task for a url that is skipped because it already has a saved result.
const MSG_URL_SKIPPED = new ConstMessage(100, "Skipped current URL");

File diff suppressed because one or more lines are too long