Compare commits

..

3 Commits

Author SHA1 Message Date
c504942144 wait for elements 2020-01-10 15:35:18 +08:00
4656e4ff64 helper function $ 2020-01-10 13:22:37 +08:00
26c6c1159e refactoring 2020-01-10 12:07:21 +08:00
18 changed files with 236 additions and 233 deletions

View File

@ -16,24 +16,27 @@
}, },
"background": { "background": {
"scripts": [ "scripts": [
"scripts/background.js", "scripts/shared/tools.js",
"scripts/result.js", "scripts/shared/common.js",
"scripts/tools.js", "scripts/background/messaging.js",
"scripts/extract.js", "scripts/background/result.js",
"scripts/extractor.js" "scripts/background/signiture.js",
"scripts/background/actions.js",
"scripts/background/extractor.js",
"scripts/background/helpers.js"
], ],
"persistent": false "persistent": false
}, },
"content_scripts": [{ "content_scripts": [{
"matches": ["*://*/*"], "matches": ["*://*/*"],
"js": [ "js": [
"scripts/jquery.min.js", "scripts/shared/tools.js",
"scripts/content.js" "scripts/shared/common.js",
"scripts/content/content.js"
], ],
"run_at": "document_idle" "run_at": "document_idle"
}], }],
"permissions": [ "permissions": [
"activeTab", "activeTab"
"storage"
] ]
} }

View File

@ -3,11 +3,9 @@
<link> <link>
<meta charset="utf-8"> <meta charset="utf-8">
<title>Data Extractor</title> <title>Data Extractor</title>
<script charset="UTF-8" type="text/javascript" src="../scripts/jquery.min.js"></script> <script charset="UTF-8" type="text/javascript" src="tip.js"></script>
<script charset="UTF-8" type="text/javascript" src="../styles/bootstrap.min.js"></script>
<script charset="UTF-8" type="text/javascript" src="./tip.js"></script>
<link rel="stylesheet" href="../styles/bootstrap.min.css"> <link rel="stylesheet" href="styles/bootstrap.min.css">
</head> </head>
<body style="margin: 20px 10px;"> <body style="margin: 20px 10px;">
@ -26,7 +24,8 @@
and type your scripts in the console. and type your scripts in the console.
</p> </p>
<p> <p>
<img src="../images/console.png" alt="" style="max-width: 489px; width: 100%; border-radius: 5px"> <img src="../images/console.png" alt=""
style="max-width: 489px; width: 100%; border-radius: 5px">
</p> </p>
</div> </div>

View File

@ -1,14 +1,14 @@
$().ready( window.onload = function () {
() => { document.querySelector('#link-extension-detail')
$("#link-extension-detail").on('click', () => { .addEventListener('click', () => {
chrome.tabs.create({ chrome.tabs.create({
'url': `chrome://extensions/?id=${chrome.runtime.id}` 'url': `chrome://extensions/?id=${chrome.runtime.id}`
}); });
}) })
$("#link-document").on('click', () => { document.querySelector('#link-document')
.addEventListener('click', () => {
chrome.tabs.create({ chrome.tabs.create({
'url': `https://git.jebbs.co/jebbs/data-extracter-extesion` 'url': `https://git.jebbs.co/jebbs/data-extracter-extesion`
}); });
}) })
} }
);

View File

@ -5,7 +5,7 @@ DataExtracter helps you quickly extract data from any web pages.
All you need to do is: All you need to do is:
- Find out the selectors (JQuery selectors) for target data - Find out the selectors for target data
- Type scripts in the console of `extension background page`, as introduced below. - Type scripts in the console of `extension background page`, as introduced below.
![](images/console.png) ![](images/console.png)
@ -14,40 +14,40 @@ All you need to do is:
Extract current page Extract current page
```js ```js
new Extractor().task(".list-item", ["a.title", "p.content"]).start(); $('.item', ['a', 'a@href']);
``` ```
Extract multiple pages (1-10, interval 1) Extract multiple pages (1-10, interval 1)
```js ```js
new Extractor().task(".list-item", ["a.title", "p.content"],"http://sample.com/?pn=${page}", 1, 10, 1).start(); $('.item', ['a', 'a@href'],"http://sample.com/?pn=${page}", 1, 10, 1);
``` ```
Extract multiple urls (list) Extract multiple urls (list)
```js ```js
new Extractor().task(".list-item", ["a.title", "p.content"],["http://sample.com/abc","http://sample.com/xyz"]).start(); $('.item', ['a', 'a@href'],["http://sample.com/abc","http://sample.com/xyz"]);
``` ```
Extract specified pages (1,3,5) Extract specified pages (1,3,5)
```js ```js
new Extractor().task(".list-item", ["a.title", "p.content"], "http://sample.com/?pn=${page}", [1, 3, 5]).start(); $('.item', ['a', 'a@href'], "http://sample.com/?pn=${page}", [1, 3, 5]);
``` ```
## Extractor.task() Signatures ## Task Call Signatures
```ts ```ts
// a task extracting data from current page // extract data from current page
task(itemsSelector:string, fieldSelectors:string[]) function (itemsSelector:string, fieldSelectors:string[])
// a task extracting data from a range of pages // extract data from a range of pages
task(itemsSelector:string, fieldSelectors:string[], urlTemplate:string, from:number, to:number, interval:number) function (itemsSelector:string, fieldSelectors:string[], urlTemplate:string, from:number, to:number, interval:number)
// a task extracting data from a list of pages // extract data from a list of pages
task(itemsSelector:string, fieldSelectors:string[], urlTemplate:string, pages:number[]) function (itemsSelector:string, fieldSelectors:string[], urlTemplate:string, pages:number[])
// a task extracting data from a list of pages // extract data from a list of pages
task(itemsSelector:string, fieldSelectors:string[], urls:string[]) function (itemsSelector:string, fieldSelectors:string[], urls:string[])
// a task extracting data of urls which extracted from last task result // extract data of urls which extracted from last task result
task(itemsSelector:string, fieldSelectors:string[], urls:ExtractResult) function (itemsSelector:string, fieldSelectors:string[], urls:ExtractResult)
``` ```
## Advanced Usage ## Advanced Usage
@ -65,7 +65,7 @@ The only way to stop tasks before its finish, is `Closing the host tab`.
e.g.: link text and target (use 'selector@attribute') e.g.: link text and target (use 'selector@attribute')
```js ```js
new Extractor().task('.list-item', ['a.title', 'a.title@href']).start(); new Extractor().task('.item', ['a', 'a@href']).start();
``` ```
### Use Task Chain. ### Use Task Chain.
@ -74,7 +74,7 @@ e.g.: Collect links from `http://sample.com/abc`, then, Extract data of each lin
```js ```js
new Extractor() new Extractor()
.task('.search-list-item', ['.item a@href'], ["http://sample.com/abc"]) .task('.search-list-item', ['a@href'], ["http://sample.com/abc"])
.task('list-item', ["a.title", "p.content"]) .task('list-item', ["a.title", "p.content"])
.start(); .start();
``` ```
@ -85,7 +85,7 @@ To a multiple task (chain) Extractor `e`:
```js ```js
e = new Extractor() e = new Extractor()
e.task('.search-list-item', ['.item a@href'], ["http://sample.com/abc"]) e.task('.search-list-item', ['a@href'], ["http://sample.com/abc"])
.task('list-item', ["a.title", "p.content"]) .task('list-item', ["a.title", "p.content"])
.start(); .start();
``` ```
@ -114,7 +114,7 @@ Here we have 2 tasks:
```js ```js
e = new Extractor() e = new Extractor()
e.task('.search-list-item', ['.item a@href'], ["http://sample.com/abc"]) e.task('.search-list-item', ['a@href'], ["http://sample.com/abc"])
.task('list-item', ["a.title", "p.content"]) .task('list-item', ["a.title", "p.content"])
.start(); .start();
``` ```

View File

@ -1,14 +0,0 @@
/**
 * Handle "DataExtracter:Extract" requests sent to the background page.
 * Invalid call arguments are answered with the usage text (`signitures`);
 * valid ones start an extraction whose failure is logged and alerted.
 * Messages with any other `from` value are ignored.
 */
chrome.runtime.onMessage.addListener((message, sender, sendResponse) => {
    if (message.from !== "DataExtracter:Extract") return;

    if (testArgs(...message.args)) {
        extract(...message.args).catch(reason => {
            console.log(reason);
            alert(reason);
        });
        return;
    }

    // Arguments match none of the supported signatures: reply with the help text.
    sendResponse(signitures);
});

View File

@ -1,18 +1,3 @@
/**
 * Extract data from the current tab / multiple urls, then ask the user
 * whether the result should be downloaded as a CSV file.
 * @param {string} itemsSelector selector for the items (data rows)
 * @param {Array<string>} fieldSelectors selectors for the fields (data columns) under each item
 * @param {...any} args forwarded unchanged to getData: url list / url template
 * and page numbers, either [from, to, interval] or [...pages]
 */
async function extract(itemsSelector, fieldSelectors, ...args) {
    const extracted = await getData(itemsSelector, fieldSelectors, ...args);
    const itemCount = extracted.data.length;
    // Show at most 50 rows as a sample; fall back to a placeholder when empty.
    const sample = extracted.toString(50) || "- Empty -";
    const accepted = confirm(
        `Click confirm to download if the sample data looks good (${itemCount} items)\n\n${sample}`
    );
    if (!accepted) return;
    saveFile(extracted, "text/csv");
}
/** /**
* Extract data from current page / multiple urls. * Extract data from current page / multiple urls.
* getData(tab, itemsSelector:string, fieldSelectors:string[]) * getData(tab, itemsSelector:string, fieldSelectors:string[])
@ -43,7 +28,7 @@ async function getData(...args) {
if (urls.length) { if (urls.length) {
pms = urls.reduce((p, url) => p.then( pms = urls.reduce((p, url) => p.then(
results => { results => {
data.push(...results); if (results) data.push(...results);
return redirectTab(tab, url).then( return redirectTab(tab, url).then(
() => extractTabData(tab, itemsSelector, fieldSelectors) () => extractTabData(tab, itemsSelector, fieldSelectors)
); );
@ -55,7 +40,8 @@ async function getData(...args) {
} }
pms.then( pms.then(
results => { results => {
data.push(...results); if (results) data.push(...results);
data.unshift(fieldSelectors);
resolve(new ExtractResult(data)); resolve(new ExtractResult(data));
}, },
err => reject(err) err => reject(err)
@ -97,7 +83,7 @@ function redirectTab(tab, url) {
if (url !== u) { if (url !== u) {
curUrl = u; curUrl = u;
let req = { let req = {
from: "GotoUrl", action: ACTION_GOTO_URL,
url: url url: url
} }
sendMessage(tab, req, `Goto url: ${url}`); sendMessage(tab, req, `Goto url: ${url}`);
@ -115,11 +101,11 @@ function redirectTab(tab, url) {
*/ */
function extractTabData(tab, itemsSelector, fieldSelectors) { function extractTabData(tab, itemsSelector, fieldSelectors) {
let req = { let req = {
from: "Extract", action: ACTION_EXTRACT,
itemsSelector: itemsSelector, itemsSelector: itemsSelector,
fieldSelectors: fieldSelectors fieldSelectors: fieldSelectors
} }
let cond = r => r && r.length; let cond = r => !MSG_ELEMENT_NOT_FOUND.isEqual(r);
return sendMessage(tab, req, 'Extract data from the tab...', cond); return sendMessage(tab, req, 'Extract data from the tab...', cond);
} }
@ -130,9 +116,9 @@ function extractTabData(tab, itemsSelector, fieldSelectors) {
*/ */
function reportIn(tab) { function reportIn(tab) {
let req = { let req = {
from: "ReportIn" action: ACTION_REPORT_IN
} }
let cond = r => r == req.from; let cond = r => r == req.action;
return sendMessage(tab, req, 'Check tab availability...', cond); return sendMessage(tab, req, 'Check tab availability...', cond);
} }
@ -144,51 +130,12 @@ function reportIn(tab) {
*/ */
function queryUrl(tab, urlExcluded, log) { function queryUrl(tab, urlExcluded, log) {
let req = { let req = {
from: "QueryUrl" action: ACTION_QUERY_URL
} }
let cond = url => url && (!urlExcluded || (urlExcluded && urlExcluded != url)); let cond = url => url && (!urlExcluded || (urlExcluded && urlExcluded != url));
return sendMessage(tab, req, log, cond); return sendMessage(tab, req, log, cond);
} }
/**
 * Repeatedly send a message to the target tab until the response is accepted.
 * Note: `req.from` is rewritten in place with the "DataExtracter:" prefix.
 * @param {object} tab the tab where to send the message
 * @param {object} req the request data
 * @param {string} log message logged to the console on every attempt
 * @param {function} cond success condition function, r:any=>boolean; any response is accepted when omitted
 * @param {number} interval retry interval in milliseconds (500 when omitted)
 * @return {Promise} a promise of the accepted response; rejected when the tab is gone
 */
function sendMessage(tab, req, log, cond, interval) {
    req.from = "DataExtracter:" + req.from;
    const delay = interval || 500;
    return new Promise((resolve, reject) => {
        const attempt = async () => {
            // console.log("request for", req.from);
            const target = await getTabByID(tab.id);
            if (!target) {
                reject("Task interrupted due to the target tab is closed.");
                return;
            }
            chrome.tabs.sendMessage(tab.id, req, response => {
                const accepted = cond ? cond(response) : true;
                if (log) console.log(log, accepted ? '(OK)' : '(failed)');
                if (!accepted) {
                    // Not good yet: try again after the configured delay.
                    setTimeout(() => {
                        attempt();
                    }, delay);
                    return;
                }
                resolve(response);
            });
        };
        attempt();
    });
}
async function createTab(url, active) { async function createTab(url, active) {
return new Promise((resolve, reject) => { return new Promise((resolve, reject) => {
chrome.tabs.create({ chrome.tabs.create({

View File

@ -14,9 +14,8 @@ class Extractor {
*/ */
task(...args) { task(...args) {
if (!testArgs(...args)) { if (!testArgs(...args)) {
console.log(`Invalid call arguments.\n\n${argsToString(...args)}\n${signitures}\n`); console.log(`Invalid task arguments: ${argsToString(...args)}\n\n${signitures}\n`);
// break call chain to avoid unexpected task running return this;
return undefined;
} }
// given >2 arguments means the task specifies target page, // given >2 arguments means the task specifies target page,
// so it won't accept last task result as url list. // so it won't accept last task result as url list.
@ -31,6 +30,7 @@ class Extractor {
clear() { clear() {
this._tasks = []; this._tasks = [];
this._results = []; this._results = [];
return this;
} }
/** /**
* Start the task chain. * Start the task chain.
@ -64,6 +64,7 @@ class Extractor {
result => { result => {
this._results[this._tasks[this._tasks.length - 1]] = result; this._results[this._tasks[this._tasks.length - 1]] = result;
this._running = false; this._running = false;
console.log("Tasks are all done.")
this.save(); this.save();
} }
).catch(err => { ).catch(err => {
@ -119,12 +120,19 @@ class Extractor {
if (!taskid) return; if (!taskid) return;
const result = this._results[this._tasks[taskid - 1]]; const result = this._results[this._tasks[taskid - 1]];
if (!result) { if (!result) {
console.log(`No task result for id (${taskid}). Forget to call ".start()"?`); console.log(`No result for task #${taskid}. Forget to call ".start()"?`);
return; return;
} }
if (confirm( if (result.data.length <= 1) { // 1 for selector headers
`Click confirm to download if the sample data looks good (${result.data.length} items)\n\n${result.toString(50) || "- Empty -"}` console.log(`No result for task #${taskid}. Forget to call ".start()"?`);
)) { return;
}
let msg = `
Please confirm to download (${result.data.length - 1} items)
${result.toString(50) || "- Empty -"}
`.trim();
if (confirm(msg)) {
saveFile(result, "text/csv"); saveFile(result, "text/csv");
} }
} }

View File

@ -0,0 +1,3 @@
/**
 * Shortcut for a single task: create a fresh Extractor, register one task
 * with the given arguments and start it right away.
 * @param {...any} args task arguments, forwarded unchanged to Extractor.task()
 * @return {any} whatever `.start()` returns
 */
function $(...args) {
    const extractor = new Extractor();
    const chain = extractor.task(...args);
    return chain.start();
}

View File

@ -0,0 +1,50 @@
/**
 * Repeatedly send a message to the target tab until the response is accepted.
 * @param {object} tab the tab where to send the message
 * @param {object} req the request data
 * @param {string} log message logged to the console on every attempt
 * @param {function} cond success condition function, r:any=>boolean; any response is accepted when omitted
 * @param {number} interval retry interval in milliseconds (500 when omitted)
 * @return {Promise} a promise of the accepted response. It is rejected when
 * the tab is no longer available, when chrome reports `runtime.lastError`
 * for the message, or when the tab lookup itself fails.
 */
function sendMessage(tab, req, log, cond, interval) {
    interval = interval || 500;
    return new Promise((resolve, reject) => {
        // loop() is async and nobody awaits it, so a failure inside it (e.g. a
        // rejected getTabByID) would otherwise be lost and leave this promise
        // pending forever. Route such failures to reject instead.
        const attempt = () => loop().catch(reject);
        attempt();
        async function loop() {
            // console.log("request for", req.action);
            let tabAvailable = await getTabByID(tab.id);
            if (!tabAvailable) {
                reject("Task interrupted due to the target tab is closed.");
                return;
            }
            chrome.tabs.sendMessage(tab.id, req, r => {
                // lastError must be read inside the callback, otherwise chrome
                // reports it as an unchecked error.
                if (chrome.runtime.lastError) {
                    reject(chrome.runtime.lastError.message);
                    return;
                }
                let flag = !cond || cond(r);
                if (log) console.log(log, flag ? '(OK)' : '(failed)');
                if (flag) {
                    resolve(r);
                } else {
                    setTimeout(attempt, interval);
                }
            });
        }
    });
}
/**
 * Answer every incoming message whose action starts with the extension
 * name with a refusal text; all other messages are ignored.
 */
chrome.runtime.onMessage.addListener((message, sender, sendResponse) => {
    const action = message.action;
    const addressedToUs = Boolean(action) && action.startsWith(EXT_NAME);
    if (addressedToUs) {
        sendResponse("Calling from user pages is not allowed.");
    }
});

View File

@ -1,53 +1,26 @@
const signitures = ` const signitures = `
## Usage ## Usage
new Extractor().task(...args).task(...args).start(); // single task
$(...args);
// managed task chains
e = new Extractor();
e.task(...args).task(...args).start();
## Extractor.task() Signitures: ## Task Call Signitures:
---------------------------- function(itemsSelector:string, fieldSelectors:string[]);
function(itemsSelector:string, fieldSelectors:string[], url:string, from:number, to:number, interval:number);
function(itemsSelector:string, fieldSelectors:string[], url:string, pages:number[]);
function(itemsSelector:string, fieldSelectors:string[], urls:string[]);
task(itemsSelector:string, fieldSelectors:string[]) ## Example:
task(itemsSelector:string, fieldSelectors:string[], url:string, from:number, to:number, interval:number) // extract all links text & url under '.item' elements
task(itemsSelector:string, fieldSelectors:string, url:string, pages:number[]) // use 'selector@attr' to get attribute of the field elements
task(itemsSelector:string, fieldSelectors:string[], urls:string[]) $(".item", ["a", "a@href"]);
task(itemsSelector:string, fieldSelectors:string[], urls:ExtractResult)
## See Detailed Help: ## See Detailed Help:
https://git.jebbs.co/jebbs/data-extracter-extesion https://git.jebbs.co/jebbs/data-extracter-extesion
`.trim(); `.trim();
/**
 * Offer `data` to the user as a file download.
 * @param {any} data content of the file; it is wrapped into a Blob
 * @param {string} mimeType MIME type of the file, e.g. "text/csv"
 * @param {string} [fileName] download name; defaults to the document title, then "result"
 */
function saveFile(data, mimeType, fileName) {
    fileName = fileName || document.title || "result";
    let blob;
    if (typeof window.Blob == "function") {
        blob = new Blob([data], {
            type: mimeType
        });
    } else {
        // Legacy engines without a Blob constructor.
        const BlobBuiler = window.BlobBuilder || window.MozBlobBuilder || window.WebKitBlobBuilder || window.MSBlobBuilder;
        const builer = new BlobBuiler();
        builer.append(data);
        blob = builer.getBlob(mimeType);
    }
    const URL = window.URL || window.webkitURL;
    const url = URL.createObjectURL(blob);
    const link = document.createElement("a");
    if ('download' in link) {
        link.style.visibility = "hidden";
        link.href = url;
        link.download = fileName;
        document.body.appendChild(link);
        const j = document.createEvent("MouseEvents");
        j.initEvent("click", true, true);
        link.dispatchEvent(j);
        document.body.removeChild(link);
        // An object URL keeps its blob alive until revoked. Release it later,
        // not immediately, so the download triggered above can still read it.
        setTimeout(function () {
            URL.revokeObjectURL(url);
        }, 40000);
    } else if (navigator.msSaveBlob) {
        navigator.msSaveBlob(blob, fileName);
        // The blob is handed over directly; the object URL is not needed here.
        URL.revokeObjectURL(url);
    } else {
        // Navigation fallback: the URL must stay valid while the page loads it.
        location.href = url;
    }
}
function testArgs(...args) { function testArgs(...args) {
switch (args.length) { switch (args.length) {
case 0, 1: case 0, 1:

View File

@ -1,58 +0,0 @@
/**
 * Ask the extension to run an extraction with the given arguments.
 * Sends a "DataExtracter:Extract" runtime message; a truthy reply is
 * printed to the console.
 * @param {...any} args arguments of the extraction, sent as `message.args`
 */
function extract(...args) {
    const onReply = reply => {
        if (!reply) return;
        console.log(reply);
    };
    const message = { from: "DataExtracter:Extract", args: args };
    chrome.runtime.sendMessage(message, onReply);
}
/**
 * Content-script dispatcher for "DataExtracter:<Action>" requests.
 * The extension prefix and the action are matched case-insensitively.
 * Requests without `from`, or with another prefix, are ignored.
 */
chrome.runtime.onMessage.addListener((request, sender, sendResponse) => {
    if (!request.from) return;
    const [ext, act] = request.from.split(":");
    if (ext.toLowerCase() !== 'dataextracter') return;
    // console.log(request);

    // Reply only when the sender supplied a response callback.
    const reply = value => {
        if (sendResponse) sendResponse(value);
    };

    const action = act.toLowerCase();
    if (action === "extract") {
        reply(extractTabData(request.itemsSelector, request.fieldSelectors));
    } else if (action === "gotourl") {
        window.location.replace(request.url);
        reply(request.url);
    } else if (action === "reportin") {
        reply(request.from);
    } else if (action === "queryurl") {
        reply(window.location.href);
    }
});
/**
 * Collect the field values of every item on the page.
 * @param {string} itemsSelector selector for the items (rows)
 * @param {Array<string>} fieldSelectors field selectors, optionally in
 * 'selector@attribute' form to read a property instead of the trimmed text
 * @return {Array<Array<string>>} one array of field values per item, or an
 * empty array as soon as any field is missing under any item
 */
function extractTabData(itemsSelector, fieldSelectors) {
    let missing = false;

    const readField = (item, selector) => {
        const [cls, attr] = selector.split('@').slice(0, 2);
        // TODO: close tab to cancel task tip
        // Once a field was missed the whole result is discarded, so stop querying.
        if (missing) return undefined;
        const matches = $(item).find(cls).toArray();
        if (matches.length === 0) {
            missing = true;
            return undefined;
        }
        const values = matches.map(el => (attr ? el[attr] : el.textContent.trim()));
        return values.join('\n');
    };

    const rows = $(itemsSelector).toArray().map(
        item => fieldSelectors.map(selector => readField(item, selector))
    );
    if (missing) return [];
    return rows;
}

View File

@ -0,0 +1,53 @@
/**
 * Content-script dispatcher: performs the requested ACTION_* and replies
 * with its result. Requests without an `action` are ignored.
 */
chrome.runtime.onMessage.addListener((request, sender, sendResponse) => {
    const action = request.action;
    if (!action) return;
    // console.log("Recieved request:",request);

    // Reply only when the sender supplied a response callback.
    const reply = value => {
        if (sendResponse) sendResponse(value);
    };

    if (action === ACTION_EXTRACT) {
        reply(extract(request.itemsSelector, request.fieldSelectors));
    } else if (action === ACTION_GOTO_URL) {
        window.location.replace(request.url);
        reply(request.url);
    } else if (action === ACTION_REPORT_IN) {
        reply(request.action);
    } else if (action === ACTION_QUERY_URL) {
        reply(window.location.href);
    }
});
/**
 * Extract rows of field values from the current document.
 *
 * Elements may be loaded asynchronously, so "nothing found" is reported as
 * MSG_ELEMENT_NOT_FOUND, which lets the sender retry until they show up.
 * Consequence: if the user writes a wrong selector, the sender retries
 * indefinitely.
 *
 * @param {string} itemsSelector selector for the items (rows)
 * @param {Array<string>} fieldSelectors field selectors, optionally in
 * 'selector@attr' form. With '@attr' the DOM property of that name is read,
 * falling back to the HTML attribute (e.g. 'data-id', 'class') when the
 * element has no such property; without it the trimmed text content is used.
 * @return {Array<Array<string>>|ConstMessage} one array per item holding one
 * value per field selector (undefined where the item lacks that field), or
 * MSG_ELEMENT_NOT_FOUND when no item matches or a field matches in no item.
 */
function extract(itemsSelector, fieldSelectors) {
    const readValue = (element, attr) => {
        if (!attr) return element.textContent.trim();
        const prop = element[attr];
        // Not every attribute is reflected as a property (data-*, class, ...).
        return prop !== undefined ? prop : element.getAttribute(attr);
    };

    const items = Array.from(document.querySelectorAll(itemsSelector));
    // items may not be loaded yet, tell the sender to retry.
    if (!items.length) return MSG_ELEMENT_NOT_FOUND;

    // Selectors that matched in at least one item.
    const fieldFound = new Set();
    const results = items.map(
        item => fieldSelectors.map(
            selector => {
                const [cls, attr] = selector.split('@').slice(0, 2);
                const fieldVals = Array.from(item.querySelectorAll(cls));
                if (!fieldVals.length) {
                    return;
                }
                fieldFound.add(selector);
                return fieldVals.map(find => readValue(find, attr)).join('\n');
            }
        )
    );
    // if a field is not found in any row, the sender should retry.
    const shouldWait = fieldSelectors.some(selector => !fieldFound.has(selector));
    return shouldWait ? MSG_ELEMENT_NOT_FOUND : results;
}

File diff suppressed because one or more lines are too long

8
scripts/shared/common.js Normal file
View File

@ -0,0 +1,8 @@
// Name of the extension; used as the prefix of every action identifier below.
const EXT_NAME = "DataExtracter";
// Action identifiers carried in the `action` field of request messages.
const ACTION_EXTRACT = `${EXT_NAME}:Extract`;
// NOTE(review): "GoToTUL" looks like a typo of "GoToURL". It is harmless as
// long as every script refers to this constant instead of the literal.
const ACTION_GOTO_URL = `${EXT_NAME}:GoToTUL`;
const ACTION_REPORT_IN = `${EXT_NAME}:ReportIn`;
const ACTION_QUERY_URL = `${EXT_NAME}:QueryURL`;
// Marker message (id 1) signalling that no element matched a selector.
const MSG_ELEMENT_NOT_FOUND = new ConstMessage(1, "No element found for at least one selector, maybe it's not loaded yet");

42
scripts/shared/tools.js Normal file
View File

@ -0,0 +1,42 @@
/**
 * A predefined message identified by an id, so that a copy of it (e.g. one
 * that went through message passing and lost its class) can still be
 * recognised with isEqual().
 */
class ConstMessage {
    /**
     * @param {number} id identifier used for comparison
     * @param {string} message human readable text
     */
    constructor(id, message) {
        this.id = id;
        this.message = message;
    }

    /**
     * Tell whether `err` carries the same id as this message.
     * @param {any} err any value; values without an `id` never match
     * @return {boolean} true when `err.id` equals this message's id
     */
    isEqual(err) {
        // Only a missing id disqualifies; a falsy id such as 0 is a valid id.
        if (err == null || err.id == null) return false;
        // Loose comparison is kept on purpose, for compatibility with callers.
        return this.id == err.id;
    }
}
/**
 * Offer `data` to the user as a file download.
 * @param {any} data content of the file; it is wrapped into a Blob
 * @param {string} mimeType MIME type of the file, e.g. "text/csv"
 * @param {string} [fileName] download name; defaults to the document title, then "result"
 */
function saveFile(data, mimeType, fileName) {
    fileName = fileName || document.title || "result";
    let blob;
    if (typeof window.Blob == "function") {
        blob = new Blob([data], {
            type: mimeType
        });
    } else {
        // Legacy engines without a Blob constructor.
        const BlobBuiler = window.BlobBuilder || window.MozBlobBuilder || window.WebKitBlobBuilder || window.MSBlobBuilder;
        const builer = new BlobBuiler();
        builer.append(data);
        blob = builer.getBlob(mimeType);
    }
    const URL = window.URL || window.webkitURL;
    const url = URL.createObjectURL(blob);
    const link = document.createElement("a");
    if ('download' in link) {
        link.style.visibility = "hidden";
        link.href = url;
        link.download = fileName;
        document.body.appendChild(link);
        const j = document.createEvent("MouseEvents");
        j.initEvent("click", true, true);
        link.dispatchEvent(j);
        document.body.removeChild(link);
        // An object URL keeps its blob alive until revoked. Release it later,
        // not immediately, so the download triggered above can still read it.
        setTimeout(function () {
            URL.revokeObjectURL(url);
        }, 40000);
    } else if (navigator.msSaveBlob) {
        navigator.msSaveBlob(blob, fileName);
        // The blob is handed over directly; the object URL is not needed here.
        URL.revokeObjectURL(url);
    } else {
        // Navigation fallback: the URL must stay valid while the page loads it.
        location.href = url;
    }
}

File diff suppressed because one or more lines are too long