Compare commits

...

8 Commits

Author SHA1 Message Date
33945e49ac code optimize 2018-09-27 16:11:51 +08:00
e9270e22b2 fix & optimize 2018-09-27 14:42:08 +08:00
1148ae79d6 refactor 2018-09-27 13:30:45 +08:00
fb2bb7b59e code optimize 2018-09-27 11:57:21 +08:00
c3b765ca8e code optimize 2018-09-26 16:48:26 +08:00
97c23961d2 update signitures 2018-09-26 16:07:55 +08:00
fa46dc269f code optimize 2018-09-26 15:41:45 +08:00
b7aaa2b7f3 url list support 2018-09-26 14:27:01 +08:00
6 changed files with 228 additions and 147 deletions

View File

@ -16,9 +16,10 @@
},
"background": {
"scripts": [
"scripts/background.js",
"scripts/result.js",
"scripts/tools.js",
"scripts/extract.js",
"scripts/background.js"
"scripts/extract.js"
],
"persistent": false
},
@ -26,7 +27,6 @@
"matches": ["*://*/*"],
"js": [
"scripts/jquery.min.js",
"scripts/tools.js",
"scripts/content.js"
],
"run_at": "document_idle"

View File

@ -1,9 +1,14 @@
chrome.runtime.onMessage.addListener(function (message, sender, sendResponse) {
if (message.from === "DataExtracter:Extract")
if (message.from === "DataExtracter:Extract") {
if (!testArgs(...message.args)) {
sendResponse(signitures);
return;
}
extract(...message.args).catch(
err => {
console.log(err);
alert(err);
}
);
}
});

View File

@ -1,19 +1,32 @@
chrome.runtime.onMessage.addListener(
/**
 * Wrap the call arguments in a "DataExtracter:Extract" message and send it
 * through chrome.runtime.sendMessage.
 * A truthy response handed to the callback is written to the console.
 * @param {...any} args arguments forwarded unchanged in the message's `args` field
 */
function extract(...args) {
    const onResponse = (response) => {
        if (!response) {
            return;
        }
        console.log(response);
    };
    chrome.runtime.sendMessage({ from: "DataExtracter:Extract", args }, onResponse);
}
chrome.runtime.onMessage.addListener(
function (request, sender, sendResponse) {
if (!request.from) return;
let [ext, act] = request.from.split(":");
if (ext.toLowerCase() !== 'dataextracter') return;
// console.log(request);
switch (request.from) {
case "DataExtracter:Extract":
let data = extractData(request.itemsSelector, request.fieldSelectors);
switch (act.toLowerCase()) {
case "extract":
let data = extractTabData(request.itemsSelector, request.fieldSelectors);
if (sendResponse) sendResponse(data);
break;
case "DataExtracter:GotoUrl":
case "gotourl":
window.location.replace(request.url);
if (sendResponse) sendResponse(request.url);
break;
case "DataExtracter:ReportIn":
case "reportin":
if (sendResponse) sendResponse(request.from);
break;
case "DataExtracter:QueryUrl":
case "queryurl":
if (sendResponse) sendResponse(window.location.href);
break;
default:
@ -22,7 +35,7 @@
}
);
function extractData(itemsSelector, fieldSelectors) {
function extractTabData(itemsSelector, fieldSelectors) {
return $(itemsSelector).toArray().map(
item => fieldSelectors.map(
selector => {
@ -31,65 +44,4 @@ function extractData(itemsSelector, fieldSelectors) {
}
)
);
}
/**
 * Console entry point for extraction.
 * - When testArgs rejects the arguments, the help text is logged and nothing else happens.
 * - With exactly two arguments, the result of extractData is passed to saveFileAsk.
 * - Otherwise the arguments are sent in a "DataExtracter:Extract" message via
 *   chrome.runtime.sendMessage; a truthy response is logged and shown with alert.
 * @param {...any} args call arguments, see the help text below for accepted forms
 */
function extract(...args) {
    const helpText = `
# DataExtracter Help
----------------------------
## Signitures:
----------------------------
function extract(itemsSelector:string, fieldSelectors:string[])
function extract(itemsSelector:string, fieldSelectors:string[], url:string, from:number, to:number, interval:number)
function extract(itemsSelector:string, fieldSelectors:string, url:string, pages:number[])
## Examples:
----------------------------
### Extract current page
extract(".list-item", ["a.title", "p.content"])
### Extract multiple pages (1-10, interval 1)
extract(".list-item", ["a.title", "p.content"],"http://sample.com/?pn=\${page}", 1, 10, 1)
### Extract specified pages (1,3,5)
extract(".list-item", ["a.title", "p.content"], "http://sample.com/?pn=\${page}", [1, 3, 5])
## Advanced Examples:
----------------------------
### Extract link text and target (use 'selector@attribute')
extract('.list-item', ['a.title', 'a.title@href'])
`.trim();

    // Invalid call: show usage instead of doing any work.
    if (!testArgs(...args)) {
        console.log(helpText);
        return;
    }

    // Two-argument form: extract right here and offer the download.
    if (args.length === 2) {
        const [itemsSelector, fieldSelectors] = args;
        saveFileAsk(extractData(itemsSelector, fieldSelectors));
        return;
    }

    // Every other form is handed over as a message.
    const request = { from: "DataExtracter:Extract", args: args };
    chrome.runtime.sendMessage(request, reply => {
        if (!reply) {
            return;
        }
        console.log(reply);
        alert(reply);
    });
}
/**
 * Check whether the arguments match one of the supported call forms.
 * - fewer than two arguments: rejected
 * - exactly two: a non-empty string followed by an array
 * - more than two: the third must be a string, followed by either an array
 *   or three values that are not NaN after numeric coercion
 * @param {...any} args the call arguments to check
 * @returns {any} a truthy value when the arguments are acceptable, a falsy one otherwise
 */
function testArgs(...args) {
    const [itemsSelector, fieldSelectors, url, pagesOrFrom, to, interval] = args;
    const count = args.length;

    if (count < 2) {
        return false;
    }

    if (count === 2) {
        // The && chain is kept as-is so the exact (possibly non-boolean) result is unchanged.
        return (
            itemsSelector &&
            fieldSelectors &&
            typeof itemsSelector === "string" &&
            fieldSelectors instanceof Array
        );
    }

    if (typeof url !== "string") {
        return false;
    }
    if (pagesOrFrom instanceof Array) {
        return true;
    }
    return !isNaN(pagesOrFrom) && !isNaN(to) && !isNaN(interval);
}

View File

@ -2,54 +2,74 @@
* Extract data from current tab / multiple urls.
* @param {string} itemsSelector items selectors for selecting items (data rows)
* @param {Array<string>} fieldSelectors fields selectors for selecting fields (data columns) under each item
* @param {string} url url template to generate urls by filling with page numers.
* @param {...number} args page numers, either [from, to, interval] or [...pages]
* @param {...any} args url list / url templates, page numbers, either [from, to, interval] or [...pages]
*/
function extract(itemsSelector, fieldSelectors, url, ...args) {
async function extract(itemsSelector, fieldSelectors, ...args) {
let result = await getData(itemsSelector, fieldSelectors, ...args);
if (confirm(
`Click confirm to download if the sample data looks good (${result.data.length} items)\n\n${result.toString(50) || "- Empty -"}`
)) {
saveFile(result, "text/csv");
}
}
/**
* Extract data from current tab / multiple urls.
* @param {string} itemsSelector items selectors for selecting items (data rows)
* @param {Array<string>} fieldSelectors fields selectors for selecting fields (data columns) under each item
* @param {...any} args url list / url templates, page numbers, either [from, to, interval] or [...pages]
*/
async function getData(itemsSelector, fieldSelectors, ...args) {
if (!testArgs(itemsSelector, fieldSelectors, ...args))
throw new Error(`Invalid call arguments.\n\n${signitures}\n\n`);
let urls = [];
if (url) {
if (args[0] instanceof Array) {
urls = args[0].map(p => url.replace("${page}", p));
} else if (args.length >= 3) {
let from = args.shift();
let to = args.shift();
let interval = args.shift();
for (let i = from; i <= to; i += interval) {
urls.push(url.replace("${page}", i));
if (args.length) {
let arg = args.shift();
if (arg instanceof Array) {
urls = arg;
} else if (arg instanceof ExractResult) {
urls = arg.column(0);
} else {
let urlTempl = arg;
if (urlTempl) {
if (args[0] instanceof Array) {
urls = args[0].map(p => urlTempl.replace("${page}", p));
} else if (args.length >= 3) {
let from = args.shift();
let to = args.shift();
let interval = args.shift();
for (let i = from; i <= to; i += interval) {
urls.push(urlTempl.replace("${page}", i));
}
}
}
}
}
let data = [];
let tab = await getActiveTab(true) || await getActiveTab(false);
if (!tab) throw new Error("Cannot find active tab.");
return new Promise((resolve, reject) => {
chrome.tabs.query({
active: true,
currentWindow: true
}, function (tabs) {
let pms;
let tab = tabs[0];
if (urls.length) {
pms = urls.reduce((p, url) => p.then(
results => {
data.push(...results);
return redirectTab(tab, url).then(
() => extractData(tab, itemsSelector, fieldSelectors)
);
},
() => p
), Promise.resolve([]));
} else {
pms = extractData(tab, itemsSelector, fieldSelectors);
}
pms.then(
let pms;
if (urls.length) {
pms = urls.reduce((p, url) => p.then(
results => {
data.push(...results);
data.unshift(fieldSelectors);
saveFileAsk(data);
resolve("save done.")
return redirectTab(tab, url).then(
() => extractTabData(tab, itemsSelector, fieldSelectors)
);
},
err => reject(err)
);
});
() => p
), Promise.resolve([]));
} else {
pms = extractTabData(tab, itemsSelector, fieldSelectors);
}
pms.then(
results => {
data.push(...results);
resolve(new ExractResult(data));
},
err => reject(err)
);
});
}
@ -60,10 +80,10 @@ function redirectTab(tab, url) {
if (url !== u) {
curUrl = u;
let req = {
from: "DataExtracter:GotoUrl",
from: "GotoUrl",
url: url
}
chrome.tabs.sendMessage(tab.id, req);
sendMessage(tab, req);
}
})
.then(() => queryUrl(tab, curUrl))
@ -71,21 +91,20 @@ function redirectTab(tab, url) {
}
/**
* extract data in from the target tab, usually used to detect if the content script is ready.
* Extract data from the target tab.
* @param {any} tab target tab
* @param {string} itemsSelector items selectors for selecting items (data rows)
* @param {Array<string>} fieldSelectors fields selectors for selecting fields (data columns) under each item
* @returns {Promise<string[]>} a promise of extracted data
*/
function extractData(tab, itemsSelector, fieldSelectors) {
function extractTabData(tab, itemsSelector, fieldSelectors) {
let req = {
from: "DataExtracter:Extract",
from: "Extract",
itemsSelector: itemsSelector,
fieldSelectors: fieldSelectors
}
let failMsg = "extractTabData failed after 10 second.";
let cond = r => !!r;
return sendMessageAndDetect(tab, req, cond, failMsg);
return sendMessage(tab, req, cond);
}
/**
@ -95,11 +114,10 @@ function extractData(tab, itemsSelector, fieldSelectors) {
*/
function reportIn(tab) {
let req = {
from: "DataExtracter:ReportIn"
from: "ReportIn"
}
let failMsg = "reportIn failed after 10 second.";
let cond = r => r == req.from;
return sendMessageAndDetect(tab, req, cond, failMsg);
return sendMessage(tab, req, cond);
}
/**
@ -110,11 +128,10 @@ function reportIn(tab) {
*/
function queryUrl(tab, urlExcluded) {
let req = {
from: "DataExtracter:QueryUrl"
from: "QueryUrl"
}
let failMsg = "queryUrl failed after 10 second.";
let cond = url => url && (!urlExcluded || (urlExcluded && urlExcluded != url));
return sendMessageAndDetect(tab, req, cond, failMsg);
return sendMessage(tab, req, cond);
}
/**
@ -122,27 +139,27 @@ function queryUrl(tab, urlExcluded) {
* @param {object} tab the tab to send the message to
* @param {object} req the request data.
* @param {function} cond success condition function, r:any=>boolean
* @param {string} failMsg message when failed after time out
* @param {number} failedTimeOut fail time out
* @param {number} detectInterval interval for detecting
* @return {Promise} a promise of the response.
*/
function sendMessageAndDetect(tab, req, cond, failMsg, failedTimeOut, detectInterval) {
function sendMessage(tab, req, cond, failedTimeOut, detectInterval) {
req.from = "DataExtracter:" + req.from;
failedTimeOut = failedTimeOut || 10000;
detectInterval = detectInterval || 500;
return new Promise((resolve, reject) => {
let timeOut;
let rejectTimeout = setTimeout(() => {
reject(failMsg);
reject(`${req.from} failed after ${failedTimeOut/1000} seconds.`);
clearTimeout(timeOut);
}, failedTimeOut);
loop();
function loop() {
chrome.tabs.sendMessage(tab.id, req, r => {
if (cond(r)) {
resolve(r);
if (!cond || cond(r)) {
clearTimeout(rejectTimeout);
resolve(r);
} else {
timeOut = setTimeout(() => {
loop();
@ -151,4 +168,15 @@ function sendMessageAndDetect(tab, req, cond, failMsg, failedTimeOut, detectInte
});
}
});
}
/**
 * Look up the active tab with chrome.tabs.query.
 * @param {boolean} currentWindow passed through as the `currentWindow` filter of the query
 * @returns {Promise<any>} resolves with the first matching tab, or undefined when the
 * query returns no tab; rejects with an Error when chrome.runtime.lastError is set
 */
function getActiveTab(currentWindow) {
    return new Promise((resolve, reject) => {
        chrome.tabs.query({
            active: true,
            currentWindow: currentWindow
        }, function (tabs) {
            // A failed query reports through lastError; without this check the
            // callback would dereference an undefined `tabs` and the promise would never settle.
            const lastError = chrome.runtime && chrome.runtime.lastError;
            if (lastError) {
                reject(new Error(lastError.message || String(lastError)));
                return;
            }
            resolve(tabs && tabs[0]);
        });
    });
}

31
scripts/result.js Normal file
View File

@ -0,0 +1,31 @@
/**
 * Holds extracted data as an array of rows, each row being an array of cells,
 * and renders it as CSV text.
 */
class ExractResult {
    /**
     * @param {Array<Array<any>>} data rows of cells; defaults to an empty list when falsy
     */
    constructor(data) {
        this._data = data || [];
    }
    /**
     * @param {number} index row index
     * @returns {Array<any>} the row at `index`, or undefined when out of range
     */
    row(index) {
        return this._data[index];
    }
    /**
     * @param {number} index column index
     * @returns {Array<any>} the cell at `index` of every row, in row order
     */
    column(index) {
        return this._data.map(cells => cells[index]);
    }
    /**
     * @returns {Array<Array<any>>} the underlying rows (not a copy)
     */
    get data() {
        return this._data;
    }
    /**
     * Render the rows as CSV: every cell is trimmed, wrapped in double quotes with
     * embedded quotes doubled, cells are joined by "," and each row ends with "\n".
     * Missing cells (null / undefined) render as empty fields and non-string cells
     * are converted with String() instead of throwing.
     * @param {number} [rowsCount] when greater than 0, only the first `rowsCount` rows are rendered
     * @returns {string} the CSV text, "" when there are no rows
     */
    toString(rowsCount) {
        const rows = rowsCount > 0 ? this._data.slice(0, rowsCount) : this._data;
        const quote = cell => {
            const text = cell == null ? "" : String(cell);
            return '"' + text.trim().replace(/"/g, '""') + '"';
        };
        return rows
            .map(cells => cells.map(cell => quote(cell)).join(",") + "\n")
            .join("");
    }
}

View File

@ -1,16 +1,43 @@
/**
 * Render rows of cells as CSV text.
 * Every cell is trimmed and wrapped in double quotes with embedded quotes doubled;
 * cells are joined by "," and each row is terminated by "\n".
 * Missing cells (null / undefined) become empty fields and non-string cells are
 * converted with String(), so a sparse or mixed-type row no longer throws.
 * @param {Array<Array<any>>} data rows to render
 * @returns {string} the CSV text, "" for an empty list of rows
 */
function formatCSV(data) {
    const quote = cell => {
        const text = cell == null ? "" : String(cell);
        return '"' + text.trim().replace(/"/g, '""') + '"';
    };
    return data
        .map(lineCells => lineCells.map(cell => quote(cell)).join(",") + "\n")
        .join("");
}
// Help text listing the supported call forms of extract(), with examples.
// The `\${page}` escapes keep the placeholder literal inside this template string.
const signitures = `
# DataExtracter Help
----------------------------
## Signitures:
----------------------------
function extract(itemsSelector:string, fieldSelectors:string[])
function extract(itemsSelector:string, fieldSelectors:string[], url:string, from:number, to:number, interval:number)
function extract(itemsSelector:string, fieldSelectors:string[], url:string, pages:number[])
function extract(itemsSelector:string, fieldSelectors:string[], urls:string[])
function extract(itemsSelector:string, fieldSelectors:string[], urls:ExractResult)
## Examples:
----------------------------
### Extract current page
extract(".list-item", ["a.title", "p.content"])
### Extract multiple pages (1-10, interval 1)
extract(".list-item", ["a.title", "p.content"],"http://sample.com/?pn=\${page}", 1, 10, 1)
### Extract multiple urls (list)
extract(".list-item", ["a.title", "p.content"],["http://sample.com/abc","http://sample.com/xyz"])
### Extract specified pages (1,3,5)
extract(".list-item", ["a.title", "p.content"], "http://sample.com/?pn=\${page}", [1, 3, 5])
## Advanced Examples:
----------------------------
### Extract link text and target (use 'selector@attribute')
extract('.list-item', ['a.title', 'a.title@href'])
### Collect links from page(s) & Extract data of each link
>> (Available only in console of extension background page)
extract('body',["a.title", "p.content"], await getData('.list-item', ['.item a@href'],["http://sample.com/abc"]))
`.trim();
function saveFile(data, mimeType, fileName) {
fileName = fileName || document.title || "result";
@ -44,10 +71,48 @@ function saveFile(data, mimeType, fileName) {
}
}
function saveFileAsk(data) {
let csv = formatCSV(data.slice(1, 50)).trim() || "- Empty -";
if (confirm(`Click confirm to download if the sample data looks good (${data.length-1} items)\n\n${csv}`)) {
csv = formatCSV(data);
saveFile(csv, "text/csv");
/**
 * Check whether the arguments match one of the supported call forms:
 *   (itemsSelector, fieldSelectors)
 *   (itemsSelector, fieldSelectors, urls: string[] | ExractResult)
 *   (itemsSelector, fieldSelectors, urlTemplate, pages: number[])
 *   (itemsSelector, fieldSelectors, urlTemplate, from, to, interval)
 * itemsSelector must be a non-empty string and fieldSelectors an array of strings.
 * from / to / interval must be real finite numbers (no numeric strings, null or NaN)
 * and interval must be positive, since a page loop stepping by a zero or negative
 * interval would never reach `to`.
 * @param {...any} args the call arguments to check
 * @returns {boolean} true when the arguments match a supported form
 */
function testArgs(...args) {
    const [itemsSelector, fieldSelectors, third, fourth] = args;
    const isStringArray = value =>
        value instanceof Array && value.every(item => typeof item === "string");
    const isFiniteNumber = value => typeof value === "number" && Number.isFinite(value);

    // Every form starts with the same two leading arguments.
    if (args.length < 2) return false;
    if (typeof itemsSelector !== "string" || itemsSelector === "") return false;
    if (!isStringArray(fieldSelectors)) return false;

    switch (args.length) {
        case 2:
            return true;
        case 3:
            // Array check first, so plain url lists never touch ExractResult.
            return isStringArray(third) || third instanceof ExractResult;
        case 4:
            return typeof third === "string" &&
                fourth instanceof Array &&
                fourth.every(page => isFiniteNumber(page));
        case 6:
            return typeof third === "string" &&
                args.slice(3).every(value => isFiniteNumber(value)) &&
                args[5] > 0;
        default:
            return false;
    }
}