219 lines
7.2 KiB
JavaScript
219 lines
7.2 KiB
JavaScript
/**
 * Extract data from the current tab or from multiple urls, show a sample to
 * the user and, once confirmed, hand the result to saveFile as "text/csv".
 * @param {string} itemsSelector selector for the items (data rows)
 * @param {Array<string>} fieldSelectors selectors for the fields (data columns) under each item
 * @param {...any} args url list, or a url template followed by page numbers,
 *                      given either as from, to, interval or as an array of pages
 */
async function extract(itemsSelector, fieldSelectors, ...args) {
    const result = await getData(itemsSelector, fieldSelectors, ...args);
    const itemCount = result.data.length;
    // presumably toString(50) limits the preview to 50 rows; verify against ExtractResult
    const sample = result.toString(50) || "- Empty -";
    const question = `Click confirm to download if the sample data looks good (${itemCount} items):\n\n${sample}`;
    if (!confirm(question)) {
        return;
    }
    saveFile(result, "text/csv");
}
|
||
|
||
/**
 * Extract data from a tab, either from the page it currently shows or from a
 * list of urls that are visited one after another.
 *
 * Supported call signatures (a leading non-string argument is taken as the
 * target tab; without it the active tab is used):
 * getData(tab, itemsSelector:string, fieldSelectors:string[])
 * getData(tab, itemsSelector:string, fieldSelectors:string[], url:string, from:number, to:number, interval:number)
 * getData(tab, itemsSelector:string, fieldSelectors:string[], url:string, pages:number[])
 * getData(tab, itemsSelector:string, fieldSelectors:string[], urls:string[])
 * getData(tab, itemsSelector:string, fieldSelectors:string[], urls:ExtractResult)
 * getData(itemsSelector:string, fieldSelectors:string[])
 * getData(itemsSelector:string, fieldSelectors:string[], url:string, from:number, to:number, interval:number)
 * getData(itemsSelector:string, fieldSelectors:string[], url:string, pages:number[])
 * getData(itemsSelector:string, fieldSelectors:string[], urls:string[])
 * getData(itemsSelector:string, fieldSelectors:string[], urls:ExtractResult)
 * @param {...any} args see the signatures above
 * @returns {Promise<ExtractResult>} the rows collected from every visited page, in visiting order
 * @throws {Error} when the arguments are rejected by testArgs or no active tab can be found
 */
async function getData(...args) {
    let tab;
    // Selectors are strings, so a non-string first argument must be the tab.
    if (typeof args[0] !== 'string') tab = args.shift();
    if (!testArgs(...args))
        throw new Error(`Invalid call arguments.\n\n${signitures}\n\n`);

    // Declared locally: assigning these without a declaration would create
    // globals shared (and overwritten) by concurrent calls, and throws in
    // strict mode.
    const [itemsSelector, fieldSelectors, ...urlArgs] = args;
    const urls = parseUrls(...urlArgs);

    // Prefer the active tab of the current window, then fall back to the
    // other query.
    if (!tab) tab = await getActiveTab(true) || await getActiveTab(false);
    if (!tab) throw new Error("Cannot find active tab.");

    const data = [];
    if (urls.length) {
        // Pages are visited strictly one at a time because they share a tab;
        // the first failure aborts the remaining urls.
        for (const url of urls) {
            await redirectTab(tab, url);
            data.push(...await extractTabData(tab, itemsSelector, fieldSelectors));
        }
    } else {
        data.push(...await extractTabData(tab, itemsSelector, fieldSelectors));
    }
    return new ExtractResult(data);
}
|
||
|
||
/**
 * Turn the trailing arguments of getData into the list of urls to visit.
 *
 * Accepted forms:
 *  - (urls:string[])                      returned as is
 *  - (urls:ExtractResult)                 its squashed values, falsy ones dropped
 *  - (template:string, pages:any[])       one url per page
 *  - (template:string, from, to, interval) one url per page in the inclusive range
 * Every "${page}" placeholder in the template is replaced by the page value.
 * Anything else (including no arguments) yields an empty list.
 * @param {...any} args url arguments as described above
 * @returns {Array<string>} urls to visit, possibly empty
 * @throws {RangeError} when interval is not positive although from <= to,
 *                      since such a range would never terminate
 */
function parseUrls(...args) {
    if (!args.length) return [];
    const [first, ...rest] = args;

    // Array.isArray also recognises arrays created in another window/realm.
    if (Array.isArray(first)) return first;
    if (first instanceof ExtractResult) return first.squash().filter(v => !!v);
    if (!first) return [];

    // split/join fills every placeholder, not only the first one, and does
    // not give "$" in the replacement any special meaning.
    const fillPage = page => first.split("${page}").join(String(page));

    const [pagesOrFrom] = rest;
    if (Array.isArray(pagesOrFrom)) return pagesOrFrom.map(page => fillPage(page));

    if (rest.length >= 3) {
        const [from, to, interval] = rest;
        if (interval <= 0 && from <= to) {
            throw new RangeError(`Invalid page interval ${interval}: it must be greater than 0.`);
        }
        const urls = [];
        for (let page = from; page <= to; page += interval) {
            urls.push(fillPage(page));
        }
        return urls;
    }
    return [];
}
|
||
|
||
/**
 * Point the tab at the given url (unless it already reports that url) and
 * wait until the tab reports in again.
 * @param {any} tab target tab
 * @param {string} url url to navigate to
 * @returns {Promise<string>} a promise of the report in message
 */
async function redirectTab(tab, url) {
    const urlBefore = await queryUrl(tab);
    let urlToLeave = "";
    if (url !== urlBefore) {
        urlToLeave = urlBefore;
        // Deliberately not awaited: the url polling below is what waits for
        // the tab to move away from its previous url.
        sendMessage(tab, { from: "GotoUrl", url });
    }
    // With an empty urlToLeave this resolves as soon as the tab reports any url.
    await queryUrl(tab, urlToLeave);
    return reportIn(tab);
}
|
||
|
||
/**
 * Ask the target tab to extract data, retrying until it answers with a
 * non-empty response.
 * @param {any} tab target tab
 * @param {string} itemsSelector selector for the items (data rows)
 * @param {Array<string>} fieldSelectors selectors for the fields (data columns) under each item
 * @returns {Promise<string[]>} a promise of extracted data
 */
function extractTabData(tab, itemsSelector, fieldSelectors) {
    // Only a response that exists and has a non-zero length counts as done.
    const hasRows = response => response && response.length;
    return sendMessage(
        tab,
        { from: "Extract", itemsSelector, fieldSelectors },
        hasRows
    );
}
|
||
|
||
/**
 * Ask the target tab to report in, usually used to detect whether the
 * content script is ready.
 * @param {any} tab target tab
 * @returns {Promise<string>} a promise of the report in message
 */
function reportIn(tab) {
    const request = { from: "ReportIn" };
    // request.from is read when a reply arrives, i.e. after sendMessage has
    // rewritten it with its prefix, so the reply is compared to the prefixed name.
    return sendMessage(tab, request, reply => reply == request.from);
}
|
||
|
||
/**
 * Ask the target tab for its url.
 * @param {any} tab target tab
 * @param {string} urlExcluded if specified, queryUrl resolves only when the response differs from urlExcluded
 * @returns {Promise<string>} a promise of the url
 */
function queryUrl(tab, urlExcluded) {
    const isWanted = url => {
        if (!url) return false;
        if (!urlExcluded) return true;
        return urlExcluded != url;
    };
    return sendMessage(tab, { from: "QueryUrl" }, isWanted);
}
|
||
|
||
/**
 * Repeatedly send a message to the target tab until the response satisfies
 * the given condition.
 *
 * Note: req.from is prefixed with "DataExtracter:" in place, so the caller's
 * request object is changed by this call.
 * @param {object} tab the tab where to send the message
 * @param {object} req the request data
 * @param {function} cond success condition function, r:any=>boolean; when omitted the first response is accepted
 * @param {number} interval milliseconds between attempts, 500 when omitted or falsy
 * @return {Promise} a promise of the response; rejects with a string message
 *                   when the tab is gone, or with the thrown error when an
 *                   attempt or cond fails unexpectedly
 */
function sendMessage(tab, req, cond, interval) {
    req.from = "DataExtracter:" + req.from;
    interval = interval || 500;
    return new Promise((resolve, reject) => {
        // An exception inside an attempt must settle the promise; otherwise
        // the caller would wait forever.
        const run = () => attempt().catch(reject);

        run();

        async function attempt() {
            console.log("request for", req.from);
            const tabAvailable = await getTabByID(tab.id);
            if (!tabAvailable) {
                reject("Task interrupted due to the target tab is closed.");
                return;
            }

            chrome.tabs.sendMessage(tab.id, req, r => {
                // Read lastError so a missing receiver (e.g. page still
                // loading) is not logged as an unchecked runtime error; the
                // undefined response is handled by cond / the retry below.
                void chrome.runtime.lastError;

                let accepted;
                try {
                    accepted = !cond || cond(r);
                } catch (err) {
                    reject(err);
                    return;
                }
                if (accepted) {
                    resolve(r);
                } else {
                    setTimeout(run, interval);
                }
            });
        }
    });
}
|
||
|
||
/**
 * Open a new tab through chrome.tabs.create.
 * @param {string} url url passed to chrome.tabs.create
 * @param {boolean} active value passed as the `active` option
 * @returns {Promise<any>} a promise of the tab handed to the create callback
 */
async function createTab(url, active) {
    const options = { url, active };
    return new Promise(resolve => {
        chrome.tabs.create(options, tab => resolve(tab));
    });
}
|
||
|
||
/**
 * Query for an active tab through chrome.tabs.query.
 * @param {boolean} currentWindow value passed as the `currentWindow` filter
 * @returns {Promise<any>} a promise of the first matching tab, undefined when there is none
 */
async function getActiveTab(currentWindow) {
    const filter = { active: true, currentWindow };
    return new Promise(resolve => {
        chrome.tabs.query(filter, tabs => resolve(tabs[0]));
    });
}
|
||
|
||
/**
 * Look a tab up by its id through chrome.tabs.get.
 * @param {number} id id of the tab
 * @returns {Promise<any>} a promise of whatever chrome.tabs.get hands to its callback
 */
async function getTabByID(id) {
    return new Promise(resolve => {
        chrome.tabs.get(id, tab => {
            // lastError is read and discarded; presumably this keeps a failed
            // lookup (e.g. closed tab) from being logged as unchecked.
            void chrome.runtime.lastError;
            resolve(tab);
        });
    });
}