Compare commits

...

5 Commits

Author SHA1 Message Date
78fa48beb0 update documents 2018-09-28 14:14:59 +08:00
93dc249e2a task chain & management 2018-09-28 11:17:05 +08:00
43bb837abd extractTabData report [] if any field element not found 2018-09-27 20:57:42 +08:00
24f2c26cbc remove fail timeout logic 2018-09-27 18:19:06 +08:00
df809f6e60 use all values of ExractResult 2018-09-27 16:40:05 +08:00
9 changed files with 307 additions and 35 deletions

BIN
images/console.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 36 KiB

BIN
images/extnsion.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 24 KiB

View File

@ -1,7 +1,7 @@
{
"manifest_version": 2,
"name": "Data Extracter",
"version": "0.0.1",
"version": "0.1.0",
"author": "jebbs",
"description": "Extract data from web page elements as sheet.",
"icons": {
@ -19,7 +19,8 @@
"scripts/background.js",
"scripts/result.js",
"scripts/tools.js",
"scripts/extract.js"
"scripts/extract.js",
"scripts/extractor.js"
],
"persistent": false
},

View File

@ -21,12 +21,11 @@
<div class="alert alert-info small">
<!-- <h6>Usage:</h6> -->
<p>
<b>Open console</b> and
<b>switch to Data Extracter</b>, then call the
<b>extract</b> function.
<b>Open the console of the extension background page</b> and
type your scripts.
</p>
<p>
<img src="demo.png" alt="" style="max-width: 489px; width: 100%; border-radius: 5px">
<img src="../images/console.png" alt="" style="max-width: 489px; width: 100%; border-radius: 5px">
</p>
</div>
@ -41,19 +40,21 @@
<div class="row">
<div class="col">
<div class="alert alert-success small">
<p>
<b>View Help</b>:
<br>extract()
</p>
<p>
<b>Extract current page</b>:
<br>extract("list-item", ["a.title", "p.content"])
<br>new Extractor().task(".list-item", ["a.title", "p.content"]).start();
</p>
<p>
<b>Extract multiple pages (1-10, interval 1)</b>:
<br>extract("list-item", ["a.title", "p.content"], "http://sample.com/?pn=${page}", 1, 10, 1)
<br>new Extractor().task(".list-item", ["a.title", "p.content"],
"http://sample.com/?pn=${page}", 1, 10, 1).start();
</p>
<p>
<b>Full document (Right click - Open in new tab):</b>
<br>
<a href="https://git.jebbs.co/jebbs/data-extracter-extesion">https://git.jebbs.co/jebbs/data-extracter-extesion</a>
</p>
</div>
</div>
</div>

138
readme.md Normal file
View File

@ -0,0 +1,138 @@
# DataExtracter Help
----------------------------
DataExtracter helps you quickly extract data from any web pages. All you need to do is:
- Find out the selectors (JQuery selectors) for target data
- Call Extractor methods in the `extension background page console`, as introduced below.
Where is the extension background page console?
Go to <chrome://extensions/> and click the `background page` link of the extension
![](images/extnsion.png)
In the opening window, find `Console`, and type your scripts.
![](images/console.png)
## Quick Start
Extract current page
```js
new Extractor().task(".list-item", ["a.title", "p.content"]).start();
```
Extract multiple pages (1-10, interval 1)
```js
new Extractor().task(".list-item", ["a.title", "p.content"],"http://sample.com/?pn=${page}", 1, 10, 1).start();
```
Extract multiple urls (list)
```js
new Extractor().task(".list-item", ["a.title", "p.content"],["http://sample.com/abc","http://sample.com/xyz"]).start();
```
Extract specified pages (1,3,5)
```js
new Extractor().task(".list-item", ["a.title", "p.content"], "http://sample.com/?pn=${page}", [1, 3, 5]).start();
```
## Extractor.task() Signatures:
```ts
// a task extracting data from current page
task(itemsSelector:string, fieldSelectors:string[])
// a task extracting data from a range of pages
task(itemsSelector:string, fieldSelectors:string[], urlTemplate:string, from:number, to:number, interval:number)
// a task extracting data from a list of pages
task(itemsSelector:string, fieldSelectors:string[], urlTemplate:string, pages:number[])
// a task extracting data from a list of pages
task(itemsSelector:string, fieldSelectors:string[], urls:string[])
// a task extracting data of urls which extracted from last task result
task(itemsSelector:string, fieldSelectors:string[], urls:ExractResult)
```
## Advanced Usage:
### Stop tasks
The only way to stop tasks before they finish is to close the tab that is running them.
### Extract attributes.
e.g.: link text and target (use 'selector@attribute')
```js
new Extractor().task('.list-item', ['a.title', 'a.title@href']).start();
```
### Use task chain.
e.g.: Collect links from `http://sample.com/abc` & Extract data of each link
```js
new Extractor()
.task('.search-list-item', ['.item a@href'], ["http://sample.com/abc"])
.task('.list-item', ["a.title", "p.content"])
.start();
```
### Save result of any task
For an Extractor `e` with multiple chained tasks:
```js
e = new Extractor()
e.task('.search-list-item', ['.item a@href'], ["http://sample.com/abc"])
.task('.list-item', ["a.title", "p.content"])
.start();
```
User will be asked to save the final result when it finishes.
You may want to save another task's result, other than the final:
```js
// save the result of first task
// that is, a list of urls
e.save(1)
```
In case you want to save the final result again, use:
```js
e.save()
```
### Restart tasks
In case a later task fails, you don't need to restart all tasks.
Here we have 2 tasks:
```js
e = new Extractor()
e.task('.search-list-item', ['.item a@href'], ["http://sample.com/abc"])
.task('.list-item', ["a.title", "p.content"])
.start();
```
Suppose the second task fails; we can restart and continue from task 2:
```js
e.restart(2);
```
If you'd like to restart all tasks, use:
```js
e.start();
// or
e.restart();
```

View File

@ -36,12 +36,23 @@ chrome.runtime.onMessage.addListener(
);
/**
 * Extract field values from the current document.
 * For every element matching `itemsSelector`, each entry of `fieldSelectors`
 * is looked up inside that element. A field selector may be written as
 * 'selector@attribute': the part after '@' names a property read from each
 * matched element; without it the trimmed text content is used. Multiple
 * matches for one field are joined with a newline.
 *
 * NOTE(review): this span is a diff hunk. The first `return $(itemsSelector)...`
 * statement appears to be the superseded implementation and the
 * `fieldNotFound` version below it the replacement — confirm before editing.
 *
 * @param {string} itemsSelector jQuery selector for the item elements
 * @param {string[]} fieldSelectors selectors (optionally 'selector@attribute') evaluated inside each item
 * @return {Array} one array of field values per item; in the `fieldNotFound`
 * version, an empty array when any field selector matched nothing in some item
 */
function extractTabData(itemsSelector, fieldSelectors) {
return $(itemsSelector).toArray().map(
item => fieldSelectors.map(
selector => {
let [cls, attr] = selector.split('@').slice(0, 2);
return $(item).find(cls).toArray().map(find => attr ? find[attr] : find.textContent.trim()).join('\n')
}
)
// Set as soon as one field selector matches nothing inside an item.
let fieldNotFound = false;
let results = $(itemsSelector).toArray().map(
item => {
return fieldSelectors.map(
selector => {
// 'selector@attr' -> [selector, attr]; attr is undefined when there is no '@'.
let [cls, attr] = selector.split('@').slice(0, 2);
// TODO: close tab to cancel task tip
// A field is already missing: skip the remaining lookups, the result is discarded anyway.
if (fieldNotFound) return;
let fieldVals = $(item).find(cls).toArray();
if (!fieldVals.length) {
fieldNotFound = true;
return;
}
// Property value when '@attr' was given, trimmed text otherwise; one line per matched element.
return fieldVals.map(find => attr ? find[attr] : find.textContent.trim()).join('\n')
}
)
}
);
// Report nothing at all rather than rows with holes (presumably so the caller can retry — verify against the background script).
return fieldNotFound ? [] : results
}

View File

@ -28,7 +28,7 @@ async function getData(itemsSelector, fieldSelectors, ...args) {
if (arg instanceof Array) {
urls = arg;
} else if (arg instanceof ExractResult) {
urls = arg.column(0);
urls = arg.squash().filter(v => !!v);
} else {
let urlTempl = arg;
if (urlTempl) {
@ -103,7 +103,7 @@ function extractTabData(tab, itemsSelector, fieldSelectors) {
itemsSelector: itemsSelector,
fieldSelectors: fieldSelectors
}
let cond = r => !!r;
let cond = r => r && r.length;
return sendMessage(tab, req, cond);
}
@ -139,31 +139,30 @@ function queryUrl(tab, urlExcluded) {
* @param {object} tab the table where to send the message
* @param {object} req the request data.
* @param {function} cond success condition function, r:any=>boolean
* @param {number} failedTimeOut fail time out
* @param {number} detectInterval interval for detecting
* @param {number} interval interval for detecting
* @return {Promise} a promise of the response.
*/
function sendMessage(tab, req, cond, failedTimeOut, detectInterval) {
function sendMessage(tab, req, cond, interval) {
req.from = "DataExtracter:" + req.from;
failedTimeOut = failedTimeOut || 10000;
detectInterval = detectInterval || 500;
interval = interval || 500;
return new Promise((resolve, reject) => {
let timeOut;
let rejectTimeout = setTimeout(() => {
reject(`${req.from} failed after ${failedTimeOut/1000} seconds.`);
clearTimeout(timeOut);
}, failedTimeOut);
loop();
/**
 * Send `req` to the tab and keep retrying every `interval` milliseconds
 * until `cond` accepts the response (or immediately when no `cond` is given).
 * Settles the enclosing promise through its `resolve` / `reject` callbacks.
 */
async function loop() {
    console.log("request for", req.from);
    let tabAvailable;
    try {
        tabAvailable = await getTabByID(tab.id);
    } catch (err) {
        reject(err);
        return;
    }
    if (!tabAvailable) {
        // A `throw` here would only reject the promise returned by loop(),
        // which nobody awaits, leaving the enclosing promise pending forever.
        // Reject the enclosing promise so the caller actually sees the failure.
        reject(new Error("Task interupted due to the target tab is closed."));
        return;
    }
    chrome.tabs.sendMessage(tab.id, req, r => {
        if (!cond || cond(r)) {
            resolve(r);
        } else {
            // Response not acceptable yet: poll again later.
            setTimeout(loop, interval);
        }
    });
}
@ -179,4 +178,12 @@ async function getActiveTab(currentWindow) {
resolve(tabs[0]);
})
})
}
/**
 * Look up a tab by its id.
 * @param {number} id the tab id
 * @return {Promise<object|undefined>} resolves to the tab, or to undefined
 * when Chrome reports an error for the lookup (e.g. the tab no longer exists)
 */
async function getTabByID(id) {
    return new Promise(resolve => {
        chrome.tabs.get(id, tab => {
            // Reading runtime.lastError marks the error as handled; otherwise
            // Chrome logs "Unchecked runtime.lastError" for every missing tab.
            resolve(chrome.runtime.lastError ? undefined : tab);
        });
    });
}

111
scripts/extractor.js Normal file
View File

@ -0,0 +1,111 @@
/**
 * A chain of extraction tasks that run one after another.
 * The result of each task is cached so a later task can be re-run
 * (see `restart`) or saved (see `save`) without repeating earlier ones.
 */
class Exractor {
    constructor() {
        // Argument lists of the queued tasks, in execution order.
        this._tasks = [];
        // Result of each task, stored at the same index as its task in `_tasks`.
        this._results = [];
    }
    /**
     * Add a task to the Extractor.
     * One Extractor can have multiple tasks, organized as a task chain.
     * A later task uses the previous task's result as input (target url list),
     * so only the first task can have target url arguments; later tasks can't.
     * @param {...any} args itemsSelector, fieldSelectors, and more args to specify target urls.
     * @return {Exractor} this instance, to allow chaining.
     * @throws {Error} when the arguments do not match any supported signature.
     */
    task(...args) {
        if (!testArgs(...args))
            throw new Error(`Invalid call arguments.\n\n${signitures}\n\n`);
        // More than 2 arguments means the task specifies its own target pages,
        // so it won't accept the previous task's result as url list.
        // In that case former tasks are useless and can be cleared.
        if (args.length > 2) this.clear();
        this._tasks.push(args);
        return this;
    }
    /**
     * Clear tasks and cached results.
     */
    clear() {
        this._tasks = [];
        this._results = [];
    }
    /**
     * Start the task chain from the first task.
     * @return {Promise} settles when the whole chain has finished.
     */
    async start() {
        if (!this._tasks.length) {
            console.log('No task to run.');
            return;
        }
        return this._run(0, undefined);
    }
    /**
     * Restart from the specified task without re-running the previous tasks;
     * the cached result of the preceding task is used as input.
     * @param {number} taskid 1-based id of the task to restart from (defaults to 1).
     * @return {Promise} settles when the remaining chain has finished.
     */
    async restart(taskid) {
        taskid = this._checkTaskId(taskid, 1);
        if (!taskid) return;
        // Return the promise so callers can await or catch the whole run.
        if (taskid === 1) return this.start();
        const cache = this._results[taskid - 2];
        if (!cache) {
            console.log(`No result cache for task (id ${taskid}). \nMake sure call ".start()" before ".restart()"?`);
            return;
        }
        return this._run(taskid - 1, cache);
    }
    /**
     * Run the tasks from `firstIndex` to the end, feeding each result into
     * the next task, caching every result, then offer the final one for saving.
     * @param {number} firstIndex 0-based index of the first task to run.
     * @param {any} input result fed to the first task run; undefined for none.
     */
    async _run(firstIndex, input) {
        // Snapshot, so tasks added while running do not join this run.
        const tasks = this._tasks.slice();
        let result = input;
        for (let i = firstIndex; i < tasks.length; i++) {
            const args = tasks[i];
            result = result === undefined
                ? await getData(...args)
                : await getData(...args, result);
            this._results[i] = result;
        }
        this.save();
    }
    /**
     * Save the result of a task.
     * @param {number} taskid 1-based id of the task to save (defaults to the last task).
     */
    save(taskid) {
        taskid = this._checkTaskId(taskid, this._tasks.length);
        if (!taskid) return;
        const result = this._results[taskid - 1];
        if (!result) {
            console.log(`No task result for id (${taskid}). Forget to call ".start()"?`);
            return;
        }
        if (confirm(
            `Click confirm to download if the sample data looks good (${result.data.length} items)\n\n${result.toString(50) || "- Empty -"}`
        )) {
            saveFile(result, "text/csv");
        }
    }
    /**
     * Validate a 1-based task id.
     * @param {number} id the id to check; undefined or null selects `defaultId`.
     * @param {number} defaultId id used when none is given.
     * @return {number} the valid id, or 0 when there are no tasks or the id is out of range.
     */
    _checkTaskId(id, defaultId) {
        if (!this._tasks.length) {
            console.log("No task found.");
            return 0;
        }
        if (id === undefined || id === null) id = defaultId;
        // Numeric strings are still accepted, but fractions are not:
        // the id is used as an array index.
        const n = Number(id);
        if (!Number.isInteger(n) || n < 1 || n > this._tasks.length) {
            console.log(`Invalid task id. Rang(1-${this._tasks.length})`);
            return 0;
        }
        return n;
    }
}
// The documentation and popup help call `new Extractor()`; expose the class
// under that spelling too, keeping the original name for existing callers.
const Extractor = Exractor;

View File

@ -11,6 +11,9 @@ class ExractResult {
i => this._data[i][index]
);
}
squash() {
return this._data.reduce((p, c) => p.concat(c), []);
}
/**
 * The underlying data of this result (the internal `_data` value itself, not a copy).
 */
get data() {
return this._data;
}