Compare commits
8 Commits
667bb49e0d
...
33945e49ac
| Author | SHA1 | Date | |
|---|---|---|---|
| 33945e49ac | |||
| e9270e22b2 | |||
| 1148ae79d6 | |||
| fb2bb7b59e | |||
| c3b765ca8e | |||
| 97c23961d2 | |||
| fa46dc269f | |||
| b7aaa2b7f3 |
@ -16,9 +16,10 @@
|
||||
},
|
||||
"background": {
|
||||
"scripts": [
|
||||
"scripts/background.js",
|
||||
"scripts/result.js",
|
||||
"scripts/tools.js",
|
||||
"scripts/extract.js",
|
||||
"scripts/background.js"
|
||||
"scripts/extract.js"
|
||||
],
|
||||
"persistent": false
|
||||
},
|
||||
@ -26,7 +27,6 @@
|
||||
"matches": ["*://*/*"],
|
||||
"js": [
|
||||
"scripts/jquery.min.js",
|
||||
"scripts/tools.js",
|
||||
"scripts/content.js"
|
||||
],
|
||||
"run_at": "document_idle"
|
||||
|
||||
@ -1,9 +1,14 @@
|
||||
chrome.runtime.onMessage.addListener(function (message, sender, sendResponse) {
|
||||
if (message.from === "DataExtracter:Extract")
|
||||
if (message.from === "DataExtracter:Extract") {
|
||||
if (!testArgs(...message.args)) {
|
||||
sendResponse(signitures);
|
||||
return;
|
||||
}
|
||||
extract(...message.args).catch(
|
||||
err => {
|
||||
console.log(err);
|
||||
alert(err);
|
||||
}
|
||||
);
|
||||
}
|
||||
});
|
||||
@ -1,19 +1,32 @@
|
||||
chrome.runtime.onMessage.addListener(
|
||||
/**
 * Forward an extract call to the extension's background page.
 *
 * The arguments are not inspected here: they are wrapped, unchanged, in a
 * "DataExtracter:Extract" message and handed to chrome.runtime.sendMessage.
 * A truthy response is printed to the console; anything else is ignored.
 * @param {...any} args arguments passed through as `message.args`
 */
function extract(...args) {
    const logResponse = response => {
        if (!response) return;
        console.log(response);
    };
    chrome.runtime.sendMessage({
        from: "DataExtracter:Extract",
        args: args
    }, logResponse);
}
|
||||
|
||||
chrome.runtime.onMessage.addListener(
|
||||
function (request, sender, sendResponse) {
|
||||
if (!request.from) return;
|
||||
let [ext, act] = request.from.split(":");
|
||||
if (ext.toLowerCase() !== 'dataextracter') return;
|
||||
// console.log(request);
|
||||
switch (request.from) {
|
||||
case "DataExtracter:Extract":
|
||||
let data = extractData(request.itemsSelector, request.fieldSelectors);
|
||||
switch (act.toLowerCase()) {
|
||||
case "extract":
|
||||
let data = extractTabData(request.itemsSelector, request.fieldSelectors);
|
||||
if (sendResponse) sendResponse(data);
|
||||
break;
|
||||
case "DataExtracter:GotoUrl":
|
||||
case "gotourl":
|
||||
window.location.replace(request.url);
|
||||
if (sendResponse) sendResponse(request.url);
|
||||
break;
|
||||
case "DataExtracter:ReportIn":
|
||||
case "reportin":
|
||||
if (sendResponse) sendResponse(request.from);
|
||||
break;
|
||||
case "DataExtracter:QueryUrl":
|
||||
case "queryurl":
|
||||
if (sendResponse) sendResponse(window.location.href);
|
||||
break;
|
||||
default:
|
||||
@ -22,7 +35,7 @@
|
||||
}
|
||||
);
|
||||
|
||||
function extractData(itemsSelector, fieldSelectors) {
|
||||
function extractTabData(itemsSelector, fieldSelectors) {
|
||||
return $(itemsSelector).toArray().map(
|
||||
item => fieldSelectors.map(
|
||||
selector => {
|
||||
@ -31,65 +44,4 @@ function extractData(itemsSelector, fieldSelectors) {
|
||||
}
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
function extract(...args) {
|
||||
let sig = `
|
||||
# DataExtracter Help
|
||||
----------------------------
|
||||
|
||||
## Signitures:
|
||||
----------------------------
|
||||
|
||||
function extract(itemsSelector:string, fieldSelectors:string[])
|
||||
function extract(itemsSelector:string, fieldSelectors:string[], url:string, from:number, to:number, interval:number)
|
||||
function extract(itemsSelector:string, fieldSelectors:string, url:string, pages:number[])
|
||||
|
||||
## Examples:
|
||||
----------------------------
|
||||
|
||||
### Extract current page
|
||||
extract(".list-item", ["a.title", "p.content"])
|
||||
|
||||
### Extract multiple pages (1-10, interval 1)
|
||||
extract(".list-item", ["a.title", "p.content"],"http://sample.com/?pn=\${page}", 1, 10, 1)
|
||||
|
||||
### Extract specified pages (1,3,5)
|
||||
extract(".list-item", ["a.title", "p.content"], "http://sample.com/?pn=\${page}", [1, 3, 5])
|
||||
|
||||
## Advanced Examples:
|
||||
----------------------------
|
||||
|
||||
### Extract link text and target (use 'selector@attribute')
|
||||
extract('.list-item', ['a.title', 'a.title@href'])
|
||||
`.trim();
|
||||
if (!testArgs(...args)) {
|
||||
console.log(sig);
|
||||
return;
|
||||
}
|
||||
if (args.length == 2) {
|
||||
saveFileAsk(extractData(args[0], args[1]));
|
||||
return;
|
||||
}
|
||||
let message = {
|
||||
from: "DataExtracter:Extract",
|
||||
args: args
|
||||
}
|
||||
chrome.runtime.sendMessage(message, r => {
|
||||
if (r) {
|
||||
console.log(r);
|
||||
alert(r);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
function testArgs(...args) {
|
||||
if (args.length < 2) return false;
|
||||
if (args.length == 2)
|
||||
return (args[0] && args[1] && (typeof args[0] == "string") && (args[1] instanceof Array))
|
||||
let urls = [];
|
||||
if (args.length > 2) return (typeof args[2] == "string") && (
|
||||
(args[3] instanceof Array) ||
|
||||
(!isNaN(args[3]) && !isNaN(args[4]) && !isNaN(args[5]))
|
||||
)
|
||||
}
|
||||
@ -2,54 +2,74 @@
|
||||
* Extract data from current tab / multiple urls.
|
||||
* @param {string} itemsSelector items selectors for selecting items (data rows)
|
||||
* @param {Array<string>} fieldSelectors fields selectors for selecting fields (data columns) under each item
|
||||
* @param {string} url url template to generate urls by filling with page numers.
|
||||
* @param {...number} args page numers, either [from, to, interval] or [...pages]
|
||||
* @param {...any} args a url list, or a url template followed by page numbers given either as from, to, interval or as an array of pages
|
||||
*/
|
||||
function extract(itemsSelector, fieldSelectors, url, ...args) {
|
||||
/**
 * Collect data via getData, show a preview, and download it on confirmation.
 *
 * The preview is the first 50 rows of the result (or "- Empty -" when the
 * rendered text is empty). The result is passed to saveFile as "text/csv" only
 * when the user accepts the confirm dialog.
 * @param {string} itemsSelector items selector for selecting items (data rows)
 * @param {Array<string>} fieldSelectors field selectors for selecting fields (data columns) under each item
 * @param {...any} args remaining arguments, forwarded unchanged to getData
 */
async function extract(itemsSelector, fieldSelectors, ...args) {
    const extracted = await getData(itemsSelector, fieldSelectors, ...args);
    const itemCount = extracted.data.length;
    const preview = extracted.toString(50) || "- Empty -";
    const question = `Click confirm to download if the sample data looks good (${itemCount} items):\n\n${preview}`;
    if (!confirm(question)) return;
    saveFile(extracted, "text/csv");
}
|
||||
|
||||
/**
|
||||
* Extract data from current tab / multiple urls.
|
||||
* @param {string} itemsSelector items selectors for selecting items (data rows)
|
||||
* @param {Array<string>} fieldSelectors fields selectors for selecting fields (data columns) under each item
|
||||
* @param {...any} args a url list, or a url template followed by page numbers given either as from, to, interval or as an array of pages
|
||||
*/
|
||||
async function getData(itemsSelector, fieldSelectors, ...args) {
|
||||
if (!testArgs(itemsSelector, fieldSelectors, ...args))
|
||||
throw new Error(`Invalid call arguments.\n\n${signitures}\n\n`);
|
||||
let urls = [];
|
||||
if (url) {
|
||||
if (args[0] instanceof Array) {
|
||||
urls = args[0].map(p => url.replace("${page}", p));
|
||||
} else if (args.length >= 3) {
|
||||
let from = args.shift();
|
||||
let to = args.shift();
|
||||
let interval = args.shift();
|
||||
for (let i = from; i <= to; i += interval) {
|
||||
urls.push(url.replace("${page}", i));
|
||||
if (args.length) {
|
||||
let arg = args.shift();
|
||||
if (arg instanceof Array) {
|
||||
urls = arg;
|
||||
} else if (arg instanceof ExractResult) {
|
||||
urls = arg.column(0);
|
||||
} else {
|
||||
let urlTempl = arg;
|
||||
if (urlTempl) {
|
||||
if (args[0] instanceof Array) {
|
||||
urls = args[0].map(p => urlTempl.replace("${page}", p));
|
||||
} else if (args.length >= 3) {
|
||||
let from = args.shift();
|
||||
let to = args.shift();
|
||||
let interval = args.shift();
|
||||
for (let i = from; i <= to; i += interval) {
|
||||
urls.push(urlTempl.replace("${page}", i));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
let data = [];
|
||||
let tab = await getActiveTab(true) || await getActiveTab(false);
|
||||
if (!tab) throw new Error("Cannot find active tab.");
|
||||
return new Promise((resolve, reject) => {
|
||||
chrome.tabs.query({
|
||||
active: true,
|
||||
currentWindow: true
|
||||
}, function (tabs) {
|
||||
let pms;
|
||||
let tab = tabs[0];
|
||||
if (urls.length) {
|
||||
pms = urls.reduce((p, url) => p.then(
|
||||
results => {
|
||||
data.push(...results);
|
||||
return redirectTab(tab, url).then(
|
||||
() => extractData(tab, itemsSelector, fieldSelectors)
|
||||
);
|
||||
},
|
||||
() => p
|
||||
), Promise.resolve([]));
|
||||
} else {
|
||||
pms = extractData(tab, itemsSelector, fieldSelectors);
|
||||
}
|
||||
pms.then(
|
||||
let pms;
|
||||
if (urls.length) {
|
||||
pms = urls.reduce((p, url) => p.then(
|
||||
results => {
|
||||
data.push(...results);
|
||||
data.unshift(fieldSelectors);
|
||||
saveFileAsk(data);
|
||||
resolve("save done.")
|
||||
return redirectTab(tab, url).then(
|
||||
() => extractTabData(tab, itemsSelector, fieldSelectors)
|
||||
);
|
||||
},
|
||||
err => reject(err)
|
||||
);
|
||||
});
|
||||
() => p
|
||||
), Promise.resolve([]));
|
||||
} else {
|
||||
pms = extractTabData(tab, itemsSelector, fieldSelectors);
|
||||
}
|
||||
pms.then(
|
||||
results => {
|
||||
data.push(...results);
|
||||
resolve(new ExractResult(data));
|
||||
},
|
||||
err => reject(err)
|
||||
);
|
||||
});
|
||||
}
|
||||
|
||||
@ -60,10 +80,10 @@ function redirectTab(tab, url) {
|
||||
if (url !== u) {
|
||||
curUrl = u;
|
||||
let req = {
|
||||
from: "DataExtracter:GotoUrl",
|
||||
from: "GotoUrl",
|
||||
url: url
|
||||
}
|
||||
chrome.tabs.sendMessage(tab.id, req);
|
||||
sendMessage(tab, req);
|
||||
}
|
||||
})
|
||||
.then(() => queryUrl(tab, curUrl))
|
||||
@ -71,21 +91,20 @@ function redirectTab(tab, url) {
|
||||
}
|
||||
|
||||
/**
|
||||
* extract data in from the target tab, usually used to detect if the content script is ready.
|
||||
* Extract data from the target tab.
|
||||
* @param {any} tab target tab
|
||||
* @param {string} itemsSelector items selectors for selecting items (data rows)
|
||||
* @param {Array<string>} fieldSelectors fields selectors for selecting fields (data columns) under each item
|
||||
* @returns {Promise<string[]>} a promise of extracted data
|
||||
*/
|
||||
function extractData(tab, itemsSelector, fieldSelectors) {
|
||||
function extractTabData(tab, itemsSelector, fieldSelectors) {
|
||||
let req = {
|
||||
from: "DataExtracter:Extract",
|
||||
from: "Extract",
|
||||
itemsSelector: itemsSelector,
|
||||
fieldSelectors: fieldSelectors
|
||||
}
|
||||
let failMsg = "extractTabData failed after 10 second.";
|
||||
let cond = r => !!r;
|
||||
return sendMessageAndDetect(tab, req, cond, failMsg);
|
||||
return sendMessage(tab, req, cond);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -95,11 +114,10 @@ function extractData(tab, itemsSelector, fieldSelectors) {
|
||||
*/
|
||||
function reportIn(tab) {
|
||||
let req = {
|
||||
from: "DataExtracter:ReportIn"
|
||||
from: "ReportIn"
|
||||
}
|
||||
let failMsg = "reportIn failed after 10 second.";
|
||||
let cond = r => r == req.from;
|
||||
return sendMessageAndDetect(tab, req, cond, failMsg);
|
||||
return sendMessage(tab, req, cond);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -110,11 +128,10 @@ function reportIn(tab) {
|
||||
*/
|
||||
function queryUrl(tab, urlExcluded) {
|
||||
let req = {
|
||||
from: "DataExtracter:QueryUrl"
|
||||
from: "QueryUrl"
|
||||
}
|
||||
let failMsg = "queryUrl failed after 10 second.";
|
||||
let cond = url => url && (!urlExcluded || (urlExcluded && urlExcluded != url));
|
||||
return sendMessageAndDetect(tab, req, cond, failMsg);
|
||||
return sendMessage(tab, req, cond);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -122,27 +139,27 @@ function queryUrl(tab, urlExcluded) {
|
||||
* @param {object} tab the tab to send the message to
|
||||
* @param {object} req the request data.
|
||||
* @param {function} cond success condition function, r:any=>boolean
|
||||
* @param {string} failMsg message when failed after time out
|
||||
* @param {number} failedTimeOut fail time out
|
||||
* @param {number} detectInterval interval for detecting
|
||||
* @return {Promise} a promise of the response.
|
||||
*/
|
||||
function sendMessageAndDetect(tab, req, cond, failMsg, failedTimeOut, detectInterval) {
|
||||
function sendMessage(tab, req, cond, failedTimeOut, detectInterval) {
|
||||
req.from = "DataExtracter:" + req.from;
|
||||
failedTimeOut = failedTimeOut || 10000;
|
||||
detectInterval = detectInterval || 500;
|
||||
return new Promise((resolve, reject) => {
|
||||
let timeOut;
|
||||
let rejectTimeout = setTimeout(() => {
|
||||
reject(failMsg);
|
||||
reject(`${req.from} failed after ${failedTimeOut/1000} seconds.`);
|
||||
clearTimeout(timeOut);
|
||||
}, failedTimeOut);
|
||||
loop();
|
||||
|
||||
function loop() {
|
||||
chrome.tabs.sendMessage(tab.id, req, r => {
|
||||
if (cond(r)) {
|
||||
resolve(r);
|
||||
if (!cond || cond(r)) {
|
||||
clearTimeout(rejectTimeout);
|
||||
resolve(r);
|
||||
} else {
|
||||
timeOut = setTimeout(() => {
|
||||
loop();
|
||||
@ -151,4 +168,15 @@ function sendMessageAndDetect(tab, req, cond, failMsg, failedTimeOut, detectInte
|
||||
});
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
/**
 * Look up the active tab via chrome.tabs.query.
 *
 * The promise always settles: it resolves to the first matching tab, or to
 * undefined when nothing matched or the query failed. It never rejects, so a
 * caller can chain a fallback lookup with `||`.
 * @param {boolean} currentWindow value passed as the `currentWindow` query filter
 * @returns {Promise<object|undefined>} the first matching tab, if any
 */
async function getActiveTab(currentWindow) {
    return new Promise(resolve => {
        chrome.tabs.query({
            active: true,
            currentWindow: currentWindow
        }, function (tabs) {
            // Reading lastError marks it as handled. When the query fails,
            // `tabs` is undefined; indexing it would throw inside this callback
            // and leave the promise pending forever.
            if (chrome.runtime.lastError || !tabs) {
                resolve(undefined);
                return;
            }
            resolve(tabs[0]);
        });
    });
}
|
||||
31
scripts/result.js
Normal file
31
scripts/result.js
Normal file
@ -0,0 +1,31 @@
|
||||
/**
 * Wraps extracted rows (an array of cell arrays) and renders them as CSV text.
 */
class ExractResult {
    /**
     * @param {Array<Array<any>>} data extracted rows; an empty list is used when omitted
     */
    constructor(data) {
        this._data = data || [];
    }

    /**
     * @param {number} index row index
     * @returns {Array<any>|undefined} the cells of that row
     */
    row(index) {
        return this._data[index];
    }

    /**
     * @param {number} index column index
     * @returns {Array<any>} the cell at `index` of every row, in row order
     */
    column(index) {
        return this._data.map(cells => cells[index]);
    }

    /** @returns {Array<Array<any>>} the underlying rows (not a copy) */
    get data() {
        return this._data;
    }

    /**
     * Render rows as CSV: every cell is trimmed, wrapped in double quotes with
     * inner quotes doubled, cells are joined by "," and each row ends with "\n".
     * @param {number} [rowsCount] when greater than 0, only the first `rowsCount` rows are rendered
     * @returns {string} CSV text; the empty string when there are no rows
     */
    toString(rowsCount) {
        const rows = rowsCount > 0 ? this._data.slice(0, rowsCount) : this._data;
        const quote = cell => {
            // Cells are not guaranteed to be strings (presumably a missing field
            // arrives as null/undefined - verify against the content script).
            // Render those as empty and stringify everything else, not throw.
            const text = cell === null || cell === undefined ? "" : String(cell);
            return '"' + text.trim().replace(/"/g, '""') + '"';
        };
        return rows
            .map(cells => cells.map(quote).join(",") + "\n")
            .join("");
    }
}
|
||||
101
scripts/tools.js
101
scripts/tools.js
@ -1,16 +1,43 @@
|
||||
function formatCSV(data) {
|
||||
return data.reduce(
|
||||
(csv, lineCells) => {
|
||||
let line = lineCells.reduce(
|
||||
(lineText, cell, idx) => {
|
||||
cell = '"' + cell.trim().replace(/"/g, '""') + '"';
|
||||
return lineText + cell + (idx == lineCells.length - 1 ? "" : ",")
|
||||
}, "");
|
||||
return csv + line + "\n";
|
||||
},
|
||||
""
|
||||
);
|
||||
}
|
||||
/**
 * Help text listing the supported extract(...) call signatures with examples.
 * Surrounding blank lines of the template are removed by trim().
 */
const signitures = `
# DataExtracter Help
----------------------------

## Signitures:
----------------------------

function extract(itemsSelector:string, fieldSelectors:string[])
function extract(itemsSelector:string, fieldSelectors:string[], url:string, from:number, to:number, interval:number)
function extract(itemsSelector:string, fieldSelectors:string[], url:string, pages:number[])
function extract(itemsSelector:string, fieldSelectors:string[], urls:string[])
function extract(itemsSelector:string, fieldSelectors:string[], urls:ExractResult)

## Examples:
----------------------------

### Extract current page
extract(".list-item", ["a.title", "p.content"])

### Extract multiple pages (1-10, interval 1)
extract(".list-item", ["a.title", "p.content"],"http://sample.com/?pn=\${page}", 1, 10, 1)

### Extract multiple urls (list)
extract(".list-item", ["a.title", "p.content"],["http://sample.com/abc","http://sample.com/xyz"])

### Extract specified pages (1,3,5)
extract(".list-item", ["a.title", "p.content"], "http://sample.com/?pn=\${page}", [1, 3, 5])

## Advanced Examples:
----------------------------

### Extract link text and target (use 'selector@attribute')
extract('.list-item', ['a.title', 'a.title@href'])

### Collect links from page(s) & Extract data of each link
>> (Available only in console of extension background page)

extract('body',["a.title", "p.content"], await getData('.list-item', ['.item a@href'],["http://sample.com/abc"]))
`.trim();
|
||||
|
||||
|
||||
function saveFile(data, mimeType, fileName) {
|
||||
fileName = fileName || document.title || "result";
|
||||
@ -44,10 +71,48 @@ function saveFile(data, mimeType, fileName) {
|
||||
}
|
||||
}
|
||||
|
||||
function saveFileAsk(data) {
|
||||
let csv = formatCSV(data.slice(1, 50)).trim() || "- Empty -";
|
||||
if (confirm(`Click confirm to download if the sample data looks good (${data.length-1} items):\n\n${csv}`)) {
|
||||
csv = formatCSV(data);
|
||||
saveFile(csv, "text/csv");
|
||||
/**
 * Check whether an argument list matches one of the supported extract signatures:
 *
 *   (itemsSelector, fieldSelectors)
 *   (itemsSelector, fieldSelectors, urls: string[] | ExractResult)
 *   (itemsSelector, fieldSelectors, urlTemplate, pages: number[])
 *   (itemsSelector, fieldSelectors, urlTemplate, from, to, interval)
 *
 * `itemsSelector` must be a non-empty string and `fieldSelectors` an array of
 * strings in every form.
 * @param {...any} args the arguments to validate
 * @returns {boolean} true when the arguments match a signature, otherwise false
 */
function testArgs(...args) {
    // Fewer than two arguments never match (the former `case 0, 1:` label was
    // a comma expression and only ever matched 1).
    if (args.length < 2) return false;

    const [itemsSelector, fieldSelectors, third, ...rest] = args;
    if (typeof itemsSelector !== "string" || itemsSelector === "") return false;
    if (!testArrayVals(fieldSelectors, v => typeof v === "string")) return false;

    switch (args.length) {
        case 2:
            return true;
        case 3:
            return testArrayVals(third, v => typeof v === "string") ||
                third instanceof ExractResult;
        case 4:
            return typeof third === "string" &&
                testArrayVals(rest[0], v => typeof v === "number");
        case 6:
            // from / to / interval drive a `for (i = from; i <= to; i += interval)`
            // loop, so they must be real finite numbers and the step must be
            // positive, otherwise that loop concatenates strings or never ends.
            return typeof third === "string" &&
                rest.every(v => typeof v === "number" && Number.isFinite(v)) &&
                rest[2] > 0;
        default:
            return false;
    }

    // True when `arr` is an array whose every element passes `tester`.
    function testArrayVals(arr, tester) {
        return Array.isArray(arr) && arr.every(tester);
    }
}
|
||||
Reference in New Issue
Block a user