Compare commits
5 Commits
33945e49ac
...
78fa48beb0
| Author | SHA1 | Date | |
|---|---|---|---|
| 78fa48beb0 | |||
| 93dc249e2a | |||
| 43bb837abd | |||
| 24f2c26cbc | |||
| df809f6e60 |
BIN
images/console.png
Normal file
BIN
images/console.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 36 KiB |
BIN
images/extnsion.png
Normal file
BIN
images/extnsion.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 24 KiB |
@ -1,7 +1,7 @@
|
||||
{
|
||||
"manifest_version": 2,
|
||||
"name": "Data Extracter",
|
||||
"version": "0.0.1",
|
||||
"version": "0.1.0",
|
||||
"author": "jebbs",
|
||||
"description": "Extract data from web page elements as sheet.",
|
||||
"icons": {
|
||||
@ -19,7 +19,8 @@
|
||||
"scripts/background.js",
|
||||
"scripts/result.js",
|
||||
"scripts/tools.js",
|
||||
"scripts/extract.js"
|
||||
"scripts/extract.js",
|
||||
"scripts/extractor.js"
|
||||
],
|
||||
"persistent": false
|
||||
},
|
||||
|
||||
@ -21,12 +21,11 @@
|
||||
<div class="alert alert-info small">
|
||||
<!-- <h6>Usage:</h6> -->
|
||||
<p>
|
||||
<b>Open console</b> and
|
||||
<b>switch to Data Extracter</b>, then call the
|
||||
<b>extract</b> function.
|
||||
<b>Open the console of the extension background page</b> and
|
||||
type your scripts.
|
||||
</p>
|
||||
<p>
|
||||
<img src="demo.png" alt="" style="max-width: 489px; width: 100%; border-radius: 5px">
|
||||
<img src="../images/console.png" alt="" style="max-width: 489px; width: 100%; border-radius: 5px">
|
||||
</p>
|
||||
|
||||
</div>
|
||||
@ -41,19 +40,21 @@
|
||||
<div class="row">
|
||||
<div class="col">
|
||||
<div class="alert alert-success small">
|
||||
<p>
|
||||
<b>View Help</b>:
|
||||
<br>extract()
|
||||
</p>
|
||||
<p>
|
||||
<b>Extract current page</b>:
|
||||
<br>extract("list-item", ["a.title", "p.content"])
|
||||
<br>new Extractor().task(".list-item", ["a.title", "p.content"]).start();
|
||||
</p>
|
||||
<p>
|
||||
<b>Extract multiple pages (1-10, interval 1)</b>:
|
||||
<br>extract("list-item", ["a.title", "p.content"], "http://sample.com/?pn=${page}", 1, 10, 1)
|
||||
<br>new Extractor().task(".list-item", ["a.title", "p.content"],
|
||||
"http://sample.com/?pn=${page}", 1, 10, 1).start();
|
||||
|
||||
</p>
|
||||
<p>
|
||||
<b>Full document (Right click - Open in new tab):</b>
|
||||
<br>
|
||||
<a href="https://git.jebbs.co/jebbs/data-extracter-extesion">https://git.jebbs.co/jebbs/data-extracter-extesion</a>
|
||||
</p>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
138
readme.md
Normal file
138
readme.md
Normal file
@ -0,0 +1,138 @@
|
||||
# DataExtracter Help
|
||||
----------------------------
|
||||
|
||||
DataExtracter helps you quickly extract data from any web page. All you need to do is:
|
||||
|
||||
- Find out the selectors (jQuery selectors) for the target data
|
||||
- Call Extractor methods in the `extension background page console`, as introduced below.
|
||||
|
||||
Where is the extension background page console?
|
||||
|
||||
Go to <chrome://extensions/> and click the `background page` link of the extension.
|
||||
|
||||

|
||||
|
||||
In the opening window, find `Console`, and type your scripts.
|
||||
|
||||

|
||||
|
||||
## Quick Start
|
||||
|
||||
|
||||
|
||||
Extract current page
|
||||
```js
|
||||
new Extractor().task(".list-item", ["a.title", "p.content"]).start();
|
||||
```
|
||||
|
||||
Extract multiple pages (1-10, interval 1)
|
||||
|
||||
```js
|
||||
new Extractor().task(".list-item", ["a.title", "p.content"],"http://sample.com/?pn=${page}", 1, 10, 1).start();
|
||||
```
|
||||
|
||||
Extract multiple urls (list)
|
||||
|
||||
```js
|
||||
new Extractor().task(".list-item", ["a.title", "p.content"],["http://sample.com/abc","http://sample.com/xyz"]).start();
|
||||
```
|
||||
|
||||
Extract specified pages (1,3,5)
|
||||
|
||||
```js
|
||||
new Extractor().task(".list-item", ["a.title", "p.content"], "http://sample.com/?pn=${page}", [1, 3, 5]).start();
|
||||
```
|
||||
|
||||
## Extractor.task() Signatures:
|
||||
|
||||
```ts
|
||||
// a task extracting data from current page
|
||||
task(itemsSelector:string, fieldSelectors:string[])
|
||||
// a task extracting data from a range of pages
|
||||
task(itemsSelector:string, fieldSelectors:string[], urlTemplate:string, from:number, to:number, interval:number)
|
||||
// a task extracting data from a list of pages
|
||||
task(itemsSelector:string, fieldSelectors:string[], urlTemplate:string, pages:number[])
|
||||
// a task extracting data from a list of urls
|
||||
task(itemsSelector:string, fieldSelectors:string[], urls:string[])
|
||||
// a task extracting data from urls taken from the previous task's result
|
||||
task(itemsSelector:string, fieldSelectors:string[], urls:ExractResult)
|
||||
```
|
||||
|
||||
## Advanced Usage:
|
||||
|
||||
### Stop tasks
|
||||
|
||||
The only way to stop tasks before they finish is `Closing the Tab` that runs the tasks.
|
||||
|
||||
### Extract attributes.
|
||||
|
||||
e.g.: link text and target (use 'selector@attribute')
|
||||
|
||||
```js
|
||||
new Extractor().task('.list-item', ['a.title', 'a.title@href']).start();
|
||||
```
|
||||
|
||||
### Use task chain.
|
||||
|
||||
e.g.: Collect links from `http://sample.com/abc` & Extract data of each link
|
||||
|
||||
```js
|
||||
new Extractor()
|
||||
.task('.search-list-item', ['.item a@href'], ["http://sample.com/abc"])
|
||||
.task('.list-item', ["a.title", "p.content"])
|
||||
.start();
|
||||
```
|
||||
|
||||
### Save result of any task
|
||||
|
||||
For a multi-task (chain) Extractor `e`:
|
||||
|
||||
```js
|
||||
e = new Extractor()
|
||||
e.task('.search-list-item', ['.item a@href'], ["http://sample.com/abc"])
|
||||
.task('.list-item', ["a.title", "p.content"])
|
||||
.start();
|
||||
```
|
||||
|
||||
User will be asked to save the final result when it finishes.
|
||||
|
||||
You may want to save another task's result, other than the final:
|
||||
|
||||
```js
|
||||
// save the result of first task
|
||||
// that is, a list of urls
|
||||
e.save(1)
|
||||
```
|
||||
|
||||
In case you want to save the final result again, use:
|
||||
|
||||
```js
|
||||
e.save()
|
||||
```
|
||||
|
||||
### Restart tasks
|
||||
|
||||
In case a later task fails, you don't need to restart all tasks.
|
||||
|
||||
Here we have 2 tasks:
|
||||
|
||||
```js
|
||||
e = new Extractor()
|
||||
e.task('.search-list-item', ['.item a@href'], ["http://sample.com/abc"])
|
||||
.task('.list-item', ["a.title", "p.content"])
|
||||
.start();
|
||||
```
|
||||
|
||||
Suppose the second task fails; we can restart and continue from task 2:
|
||||
|
||||
```js
|
||||
e.restart(2);
|
||||
```
|
||||
|
||||
If you'd like to restart all tasks, use:
|
||||
|
||||
```js
|
||||
e.start();
|
||||
// or
|
||||
e.restart();
|
||||
```
|
||||
@ -36,12 +36,23 @@ chrome.runtime.onMessage.addListener(
|
||||
);
|
||||
|
||||
function extractTabData(itemsSelector, fieldSelectors) {
|
||||
return $(itemsSelector).toArray().map(
|
||||
item => fieldSelectors.map(
|
||||
selector => {
|
||||
let [cls, attr] = selector.split('@').slice(0, 2);
|
||||
return $(item).find(cls).toArray().map(find => attr ? find[attr] : find.textContent.trim()).join('\n')
|
||||
}
|
||||
)
|
||||
let fieldNotFound = false;
|
||||
let results = $(itemsSelector).toArray().map(
|
||||
item => {
|
||||
return fieldSelectors.map(
|
||||
selector => {
|
||||
let [cls, attr] = selector.split('@').slice(0, 2);
|
||||
// TODO: close tab to cancel task tip
|
||||
if (fieldNotFound) return;
|
||||
let fieldVals = $(item).find(cls).toArray();
|
||||
if (!fieldVals.length) {
|
||||
fieldNotFound = true;
|
||||
return;
|
||||
}
|
||||
return fieldVals.map(find => attr ? find[attr] : find.textContent.trim()).join('\n')
|
||||
}
|
||||
)
|
||||
}
|
||||
);
|
||||
return fieldNotFound ? [] : results
|
||||
}
|
||||
@ -28,7 +28,7 @@ async function getData(itemsSelector, fieldSelectors, ...args) {
|
||||
if (arg instanceof Array) {
|
||||
urls = arg;
|
||||
} else if (arg instanceof ExractResult) {
|
||||
urls = arg.column(0);
|
||||
urls = arg.squash().filter(v => !!v);
|
||||
} else {
|
||||
let urlTempl = arg;
|
||||
if (urlTempl) {
|
||||
@ -103,7 +103,7 @@ function extractTabData(tab, itemsSelector, fieldSelectors) {
|
||||
itemsSelector: itemsSelector,
|
||||
fieldSelectors: fieldSelectors
|
||||
}
|
||||
let cond = r => !!r;
|
||||
let cond = r => r && r.length;
|
||||
return sendMessage(tab, req, cond);
|
||||
}
|
||||
|
||||
@ -139,31 +139,30 @@ function queryUrl(tab, urlExcluded) {
|
||||
* @param {object} tab the tab where to send the message
|
||||
* @param {object} req the request data.
|
||||
* @param {function} cond success condition function, r:any=>boolean
|
||||
* @param {number} failedTimeOut fail time out
|
||||
* @param {number} detectInterval interval for detecting
|
||||
* @param {number} interval interval for detecting
|
||||
* @return {Promise} a promise of the response.
|
||||
*/
|
||||
function sendMessage(tab, req, cond, failedTimeOut, detectInterval) {
|
||||
function sendMessage(tab, req, cond, interval) {
|
||||
req.from = "DataExtracter:" + req.from;
|
||||
failedTimeOut = failedTimeOut || 10000;
|
||||
detectInterval = detectInterval || 500;
|
||||
interval = interval || 500;
|
||||
return new Promise((resolve, reject) => {
|
||||
let timeOut;
|
||||
let rejectTimeout = setTimeout(() => {
|
||||
reject(`${req.from} failed after ${failedTimeOut/1000} seconds.`);
|
||||
clearTimeout(timeOut);
|
||||
}, failedTimeOut);
|
||||
|
||||
loop();
|
||||
|
||||
function loop() {
|
||||
async function loop() {
|
||||
console.log("request for", req.from);
|
||||
let tabAvailable = await getTabByID(tab.id);
|
||||
if (!tabAvailable) {
|
||||
throw new Error("Task interupted due to the target tab is closed.");
|
||||
}
|
||||
|
||||
chrome.tabs.sendMessage(tab.id, req, r => {
|
||||
if (!cond || cond(r)) {
|
||||
clearTimeout(rejectTimeout);
|
||||
resolve(r);
|
||||
} else {
|
||||
timeOut = setTimeout(() => {
|
||||
setTimeout(() => {
|
||||
loop();
|
||||
}, detectInterval);
|
||||
}, interval);
|
||||
}
|
||||
});
|
||||
}
|
||||
@ -179,4 +178,12 @@ async function getActiveTab(currentWindow) {
|
||||
resolve(tabs[0]);
|
||||
})
|
||||
})
|
||||
}
|
||||
|
||||
/**
 * Look up a tab by its id via `chrome.tabs.get`.
 *
 * @param {number} id the id of the tab to look up
 * @return {Promise} a promise of the tab object passed to the callback,
 * or of `undefined` when `chrome.runtime.lastError` is set for the lookup
 * (presumably because the tab was closed -- confirm against the Chrome docs).
 */
async function getTabByID(id) {
    return new Promise(resolve => {
        chrome.tabs.get(id, function (tab) {
            // Reading lastError marks the failure as handled; otherwise Chrome
            // logs an "Unchecked runtime.lastError" warning for every failed lookup.
            if (chrome.runtime.lastError) {
                resolve(undefined);
                return;
            }
            resolve(tab);
        });
    });
}
|
||||
111
scripts/extractor.js
Normal file
111
scripts/extractor.js
Normal file
@ -0,0 +1,111 @@
|
||||
/**
 * Runs a chain of extraction tasks and caches the result of every task,
 * so that any result can be saved and the chain can be restarted midway.
 *
 * NOTE(review): the class is declared as `Exractor`, while the help page and
 * readme call `new Extractor()` -- confirm which name is intended.
 */
class Exractor {
    constructor() {
        // Argument lists of the tasks, in chain order.
        this._tasks = [];
        // Cached task results, keyed by the task's 0-based position in the chain.
        this._results = {};
    }
    /**
     * Add a task to Extractor. \n
     * One Extractor could have multiple tasks, which are organized in a task chain.
     * Later task will use previous task result as input (target url list).
     * So only the first task can have target url arguments, while later tasks can't.
     * @param {...any} args itemsSelector, fieldSelectors, and more args to specify target urls.
     * @return {Exractor} this instance, so that calls can be chained.
     * @throws {Error} when the arguments are rejected by testArgs.
     */
    task(...args) {
        if (!testArgs(...args))
            throw new Error(`Invalid call arguments.\n\n${signitures}\n\n`);
        // given >2 arguments means the task specifies target page,
        // so it won't accept last task result as url list.
        // in this case, former tasks are useless, can be cleared.
        if (args.length > 2) this.clear();
        this._tasks.push(args);
        return this;
    }
    /**
     * Clear tasks and caches.
     */
    clear() {
        this._tasks = [];
        // Same shape as in the constructor (a plain object, not an array).
        this._results = {};
    }
    /**
     * Start the task chain.
     * @return {Promise} settles when the whole chain has run and save() was called.
     */
    async start() {
        if (!this._tasks.length) {
            console.log('No task to run.');
            return;
        }
        return this._run(0, undefined);
    }
    /**
     * restart from specified task, but don't restart the previous tasks.
     * @param {number} taskid from which restart the tasks (1-based, defaults to 1)
     * @return {Promise} settles when the restarted part of the chain has run.
     */
    async restart(taskid) {
        taskid = this._checkTaskId(taskid, 1);
        if (!taskid) return;
        // Loose comparison kept on purpose: _checkTaskId returns the id as given,
        // which may be a numeric string.
        if (taskid == 1) {
            return this.start();
        }
        // The input of task N is the cached result of task N-1.
        let cache = this._results[taskid - 2];
        if (!cache) {
            console.log(`No result cache for task (id ${taskid}). \nMake sure call ".start()" before ".restart()"?`);
            return;
        }
        return this._run(taskid - 1, cache);
    }
    /**
     * Save result of a task
     * @param {number} taskid which task id to save (1-based, defaults to the last task).
     */
    save(taskid) {
        taskid = this._checkTaskId(taskid, this._tasks.length);
        if (!taskid) return;
        const result = this._results[taskid - 1];
        if (!result) {
            console.log(`No task result for id (${taskid}). Forget to call ".start()"?`);
            return;
        }
        if (confirm(
            `Click confirm to download if the sample data looks good (${result.data.length} items):\n\n${result.toString(50) || "- Empty -"}`
        )) {
            saveFile(result, "text/csv");
        }
    }
    /**
     * Run the tasks one after another, starting at the given position, feeding
     * each task the result of the one before it; then cache the final result
     * and call save().
     * @param {number} fromIndex 0-based position of the first task to run
     * @param {any} input result to feed the first task; undefined runs it with its own arguments only
     * @return {Promise} settles after save() was called for the final result.
     */
    _run(fromIndex, input) {
        return this._tasks.slice(fromIndex).reduce((pms, args, offset) => {
            return pms.then(
                result => {
                    if (result === undefined) return getData(...args);
                    // `result` was produced by the task right before this one.
                    this._results[fromIndex + offset - 1] = result;
                    return getData(...args, result);
                });
        }, Promise.resolve(input)).then(
            result => {
                this._results[this._tasks.length - 1] = result;
                this.save();
            }
        );
    }
    /**
     * Validate a 1-based task id.
     * @param {number} id the id to check
     * @param {number} defaultId id to use when `id` is undefined or null
     * @return {number} the valid id, or 0 when there is no task or the id is out of range.
     */
    _checkTaskId(id, defaultId) {
        if (!this._tasks.length) {
            console.log("No task found.");
            return 0;
        }
        if (defaultId && (id === undefined || id === null)) id = defaultId;
        if (isNaN(id) || id < 1 || id > this._tasks.length) {
            console.log(`Invalid task id. Rang(1-${this._tasks.length})`);
            return 0;
        }
        return id
    }
}
|
||||
@ -11,6 +11,9 @@ class ExractResult {
|
||||
i => this._data[i][index]
|
||||
);
|
||||
}
|
||||
squash() {
|
||||
return this._data.reduce((p, c) => p.concat(c), []);
|
||||
}
|
||||
get data() {
|
||||
return this._data;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user