refactoring

This commit is contained in:
2020-01-10 12:07:21 +08:00
parent 51da68fee5
commit 26c6c1159e
15 changed files with 175 additions and 207 deletions

View File

@ -0,0 +1,168 @@
/**
 * Extract data from current page / multiple urls.
 * getData(tab, itemsSelector:string, fieldSelectors:string[])
 * getData(tab, itemsSelector:string, fieldSelectors:string[], url:string, from:number, to:number, interval:number)
 * getData(tab, itemsSelector:string, fieldSelectors:string[], url:string, pages:number[])
 * getData(tab, itemsSelector:string, fieldSelectors:string[], urls:string[])
 * getData(tab, itemsSelector:string, fieldSelectors:string[], urls:ExtractResult)
 * getData(itemsSelector:string, fieldSelectors:string[])
 * getData(itemsSelector:string, fieldSelectors:string[], url:string, from:number, to:number, interval:number)
 * getData(itemsSelector:string, fieldSelectors:string[], url:string, pages:number[])
 * getData(itemsSelector:string, fieldSelectors:string[], urls:string[])
 * getData(itemsSelector:string, fieldSelectors:string[], urls:ExtractResult)
 *
 * When no tab is given, the active tab of the current window is used, falling
 * back to the active tab of another window. When target urls are given, the tab
 * is redirected to each url in order and the rows of all pages are concatenated.
 * @param {...any} args see the signatures above
 * @returns {Promise<ExtractResult>} the extracted rows, preceded by a header row holding fieldSelectors
 * @throws {Error} if the arguments do not match a signature or no tab can be found
 */
async function getData(...args) {
    let tab;
    // A leading non-string argument is the target tab.
    if (typeof args[0] !== 'string') tab = args.shift();
    if (!testArgs(...args))
        throw new Error(`Invalid call arguments.\n\n${signitures}\n\n`);
    // Declared locally: assigning undeclared names would leak globals
    // (and throws a ReferenceError in strict mode).
    const itemsSelector = args.shift();
    const fieldSelectors = args.shift();
    const urls = parseUrls(...args);
    if (!tab) tab = await getActiveTab(true) || await getActiveTab(false);
    if (!tab) throw new Error("Cannot find active tab.");

    // The first row is the header (the field selectors themselves).
    const data = [fieldSelectors];
    const collect = results => {
        if (results) data.push(...results);
    };
    if (urls.length) {
        // Pages must be visited one after another since they share a single tab.
        for (const url of urls) {
            await redirectTab(tab, url);
            collect(await extractTabData(tab, itemsSelector, fieldSelectors));
        }
    } else {
        collect(await extractTabData(tab, itemsSelector, fieldSelectors));
    }
    return new ExtractResult(data);
}
/**
 * Build the list of target urls from the url-related arguments of a task.
 * Supported forms:
 *   parseUrls()                                        -> []
 *   parseUrls(urls:string[])                           -> urls (the same array)
 *   parseUrls(urls:ExtractResult)                      -> all non-empty cells, flattened
 *   parseUrls(url:string, pages:number[])              -> url with "${page}" replaced by each page
 *   parseUrls(url:string, from:number, to:number, interval:number)
 *                                                      -> url with "${page}" replaced by from, from+interval, ... <= to
 * Anything else yields an empty list.
 * @param {...any} args url-related arguments, see the forms above
 * @returns {string[]} the target urls
 * @throws {Error} if from / to / interval are not finite numbers or interval is not positive
 */
function parseUrls(...args) {
    if (!args.length) return [];
    const [target, ...rest] = args;
    if (target instanceof Array) return target;
    if (target instanceof ExtractResult) return target.squash().filter(v => !!v);
    if (!target) return [];

    const urlTempl = target;
    if (rest[0] instanceof Array) {
        return rest[0].map(p => urlTempl.replace("${page}", p));
    }
    if (rest.length < 3) return [];

    // Coerce to numbers: with numeric strings `i += interval` would concatenate
    // ("1" + "1" === "11") and the comparison would become lexicographic.
    const from = Number(rest[0]);
    const to = Number(rest[1]);
    const interval = Number(rest[2]);
    // A zero / negative / non-finite step would never reach `to` and loop forever.
    if (!Number.isFinite(from) || !Number.isFinite(to) || !Number.isFinite(interval) || interval <= 0) {
        throw new Error(
            `Invalid page range (from: ${rest[0]}, to: ${rest[1]}, interval: ${rest[2]}). ` +
            'All of them must be finite numbers and interval must be greater than 0.'
        );
    }
    const urls = [];
    for (let i = from; i <= to; i += interval) {
        urls.push(urlTempl.replace("${page}", i));
    }
    return urls;
}
/**
 * Navigate the target tab to the given url and wait for the navigation to show up.
 * The current url is queried first; a goto request is only sent when it differs
 * from the wanted one. The goto request itself is not awaited.
 * @param {any} tab target tab
 * @param {string} url the url to go to
 * @returns {Promise<string>} a promise of the tab url reported after the redirect
 */
function redirectTab(tab, url) {
    // Resolves with the url the tab must move away from ("" when no navigation is needed).
    const navigateIfNeeded = currentUrl => {
        if (url === currentUrl) return "";
        const gotoRequest = { action: ACTION_GOTO_URL, url: url };
        sendMessage(tab, gotoRequest, `Goto url: ${url}`);
        return currentUrl;
    };
    const waitForChange = previousUrl =>
        queryUrl(tab, previousUrl, 'Check if tab url matches expected...');

    return queryUrl(tab, undefined, 'Query current url...')
        .then(navigateIfNeeded)
        .then(waitForChange);
}
/**
 * Ask the target tab to extract data, retrying until a response other than undefined arrives.
 * @param {any} tab target tab
 * @param {string} itemsSelector selector of the items (data rows)
 * @param {Array<string>} fieldSelectors selectors of the fields (data columns) under each item
 * @returns {Promise<string[]>} a promise of extracted data
 */
function extractTabData(tab, itemsSelector, fieldSelectors) {
    const hasResponse = response => response !== undefined;
    const request = {
        action: ACTION_EXTRACT,
        itemsSelector,
        fieldSelectors,
    };
    return sendMessage(tab, request, 'Extract data from the tab...', hasResponse);
}
/**
 * Request a report-in from the target tab, retrying until the tab answers
 * with the report-in action itself.
 * Usually used to detect if the content script is ready.
 * @param {any} tab target tab
 * @returns {Promise<string>} a promise of the report-in message
 */
function reportIn(tab) {
    const request = { action: ACTION_REPORT_IN };
    const isEcho = response => response == request.action;
    return sendMessage(tab, request, 'Check tab availability...', isEcho);
}
/**
 * Query the url of the target tab, retrying until a usable url is reported.
 * @param {any} tab target tab
 * @param {string} urlExcluded if specified, queryUrl resolves only when the response differs from urlExcluded
 * @param {string} log message logged to console on every attempt
 * @returns {Promise<string>} a promise of the url
 */
function queryUrl(tab, urlExcluded, log) {
    // Accept any non-empty url, unless it is the one the caller wants to move away from.
    const isAcceptable = url => url && (!urlExcluded || urlExcluded != url);
    return sendMessage(tab, { action: ACTION_QUERY_URL }, log, isAcceptable);
}
/**
 * Open a new tab.
 * @param {string} url the url to open
 * @param {boolean} active value passed as the `active` option of chrome.tabs.create
 * @returns {Promise<any>} a promise of the created tab
 */
async function createTab(url, active) {
    const options = { url, active };
    return new Promise(resolve => {
        chrome.tabs.create(options, created => resolve(created));
    });
}
/**
 * Find the active tab.
 * @param {boolean} currentWindow value passed as the `currentWindow` filter of chrome.tabs.query
 * @returns {Promise<any>} a promise of the first matching tab, undefined if there is none
 */
async function getActiveTab(currentWindow) {
    const filter = { active: true, currentWindow };
    return new Promise(resolve => {
        chrome.tabs.query(filter, matches => resolve(matches[0]));
    });
}
/**
 * Look up a tab by its id.
 * @param {number} id the tab id
 * @returns {Promise<any>} a promise of whatever chrome.tabs.get passes to its callback
 */
async function getTabByID(id) {
    const lookup = resolve => {
        chrome.tabs.get(id, found => {
            // Read lastError on purpose, the value itself is not used.
            // NOTE(review): presumably this keeps Chrome from reporting an
            // unchecked runtime.lastError when the tab is gone - confirm.
            void chrome.runtime.lastError;
            resolve(found);
        });
    };
    return new Promise(lookup);
}

View File

@ -0,0 +1,143 @@
/**
 * Runs a chain of extraction tasks in a browser tab and caches the result of
 * every task, so that the chain can be restarted from the middle and any
 * result can be saved later.
 */
class Extractor {
    constructor() {
        // Argument lists of the tasks, in execution order.
        this._tasks = [];
        this._tab = undefined;
        this._running = false;
        // Result cache, indexed by the 0-based position of the task in _tasks.
        // Indexing by position (instead of by the task arguments) keeps tasks
        // with identical arguments from overwriting each other's result.
        this._results = [];
    }
    /**
     * Add a task to the Extractor.
     * One Extractor can have multiple tasks, which are organized in a task chain.
     * A later task uses the result of the previous task as its input (target url list),
     * so only the first task can have target url arguments, while later tasks can't.
     * @param {...any} args itemsSelector, fieldSelectors, and more args to specify target urls.
     * @returns {Extractor} this, to allow call chaining
     */
    task(...args) {
        if (!testArgs(...args)) {
            console.log(`Invalid task arguments: ${argsToString(...args)}\n\n${signitures}\n`);
            // break call chain to avoid unexpected task running
            return this;
        }
        // given >2 arguments means the task specifies target page,
        // so it won't accept last task result as url list.
        // in this case, former tasks are useless, can be cleared.
        if (args.length > 2) this.clear();
        this._tasks.push(args);
        return this;
    }
    /**
     * Clear tasks and caches.
     */
    clear() {
        this._tasks = [];
        this._results = [];
    }
    /**
     * Start the task chain from the first task.
     * Errors are logged to the console instead of being thrown.
     * @returns {Promise<void>} settles when the chain has finished (or failed)
     */
    async start() {
        if (this._running) {
            console.log('The Extractor is running. Please wait..');
            return;
        }
        if (!this._tasks.length) {
            console.log('No task to run.');
            return;
        }
        // Mark as running before the first await, so that a second call
        // made while the tab is still being prepared is refused.
        this._running = true;
        try {
            const firstTaskArgs = this._tasks[0];
            if (firstTaskArgs.length > 2) {
                // task specifies target urls, create new tab with first url for it
                const urls = parseUrls(...firstTaskArgs.slice(2));
                this._tab = await createTab(urls[0], false);
            } else {
                this._tab = await getActiveTab(false);
            }
            await this._runTasks(0, undefined);
            this._running = false;
            this.save();
        } catch (err) {
            console.log(err);
        } finally {
            this._running = false;
        }
    }
    /**
     * Restart from the specified task, without re-running the previous tasks.
     * The cached result of the previous task is used as input.
     * Errors are logged to the console instead of being thrown.
     * @param {number} taskid 1-based id of the task to restart from, defaults to 1
     * @returns {Promise<void>} settles when the chain has finished (or failed)
     */
    async restart(taskid) {
        if (this._running) {
            console.log('The Extractor is running. Please wait..');
            return;
        }
        taskid = this._checkTaskId(taskid, 1);
        if (!taskid) return;
        if (taskid == 1) return this.start();
        const cache = this._results[taskid - 2];
        if (!cache) {
            console.log(`No result cache for task (id ${taskid}). \nMake sure call ".start()" before ".restart()"?`);
            return;
        }
        this._running = true;
        try {
            this._tab = await createTab(parseUrls(cache)[0], false);
            await this._runTasks(taskid - 1, cache);
            this._running = false;
            this.save();
        } catch (err) {
            console.log(err);
        } finally {
            this._running = false;
        }
    }
    /**
     * Save the result of a task, after the user confirmed a preview of it.
     * @param {number} taskid 1-based id of the task to save, defaults to the last task
     */
    save(taskid) {
        taskid = this._checkTaskId(taskid, this._tasks.length);
        if (!taskid) return;
        const result = this._results[taskid - 1];
        if (!result) {
            console.log(`No result for task #${taskid}. Forget to call ".start()"?`);
            return;
        }
        if (confirm(
            `Click confirm to download if the sample data looks good (${result.data.length} items)\n\n${result.toString(50) || "- Empty -"}`
        )) {
            saveFile(result, "text/csv");
        }
    }
    /**
     * Validate a 1-based task id.
     * @param {number} id the id to check, may be undefined / null
     * @param {number} defaultId the id used when id is undefined or null
     * @returns {number} the validated id, or 0 if there are no tasks or the id is invalid
     */
    _checkTaskId(id, defaultId) {
        if (!this._tasks.length) {
            console.log("No task found.");
            return 0;
        }
        if (defaultId && (id === undefined || id === null)) id = defaultId;
        const taskid = Number(id);
        if (!Number.isInteger(taskid) || taskid < 1 || taskid > this._tasks.length) {
            console.log(`Invalid task id. Rang(1-${this._tasks.length})`);
            return 0;
        }
        return taskid;
    }
    /**
     * Run the tasks one after another, starting at the given position,
     * feeding each task with the result of the previous one.
     * @param {number} fromIndex 0-based position of the first task to run
     * @param {any} input result used as url source of the first task; undefined to run it with its own arguments only
     * @returns {Promise<any>} a promise of the result of the last task
     */
    async _runTasks(fromIndex, input) {
        let result = input;
        for (let i = fromIndex; i < this._tasks.length; i++) {
            const args = this._tasks[i];
            result = result === undefined
                ? await getData(this._tab, ...args)
                : await getData(this._tab, ...args, result);
            this._results[i] = result;
        }
        return result;
    }
}

View File

@ -0,0 +1,50 @@
/**
 * Repeatedly send a message to the target tab until the response is detected good.
 * The returned promise rejects when the tab no longer exists, when
 * chrome.runtime.lastError is set after sending, or when anything throws while
 * sending / checking the response (e.g. cond throws or tab is not an object).
 * @param {object} tab the tab where to send the message
 * @param {object} req the request data.
 * @param {string} log message logged to console on every attempt.
 * @param {function} cond success condition function, r:any=>boolean; every response is accepted if omitted
 * @param {number} interval interval in milliseconds between attempts, 500 if omitted (or 0)
 * @return {Promise} a promise of the response.
 */
function sendMessage(tab, req, log, cond, interval) {
    interval = interval || 500;
    return new Promise((resolve, reject) => {
        // Handles one response; schedules the next attempt when it is not good yet.
        const onResponse = r => {
            try {
                if (chrome.runtime.lastError) {
                    reject(chrome.runtime.lastError.message);
                    return;
                }
                const flag = !cond || cond(r);
                if (log) console.log(log, flag ? '(OK)' : '(failed)');
                if (flag) {
                    resolve(r);
                } else {
                    setTimeout(attempt, interval);
                }
            } catch (err) {
                // Without this, an exception thrown here would leave the promise pending forever.
                reject(err);
            }
        };
        const attempt = async () => {
            try {
                // console.log("request for", req.action);
                const tabAvailable = await getTabByID(tab.id);
                if (!tabAvailable) {
                    reject("Task interrupted due to the target tab is closed.");
                    return;
                }
                chrome.tabs.sendMessage(tab.id, req, onResponse);
            } catch (err) {
                reject(err);
            }
        };
        attempt();
    });
}
// Answer messages whose action starts with EXT_NAME with a refusal;
// every other message is left unanswered by this listener.
chrome.runtime.onMessage.addListener((message, sender, sendResponse) => {
    const isExtensionAction = message.action && message.action.startsWith(EXT_NAME);
    if (isExtensionAction) {
        sendResponse("Calling from user pages is not allowed.");
    }
});

View File

@ -0,0 +1,34 @@
/**
 * Table of extracted data: an array of rows, each row being an array of cells.
 */
class ExtractResult {
    /**
     * @param {Array<Array<any>>} data the rows; an empty table if omitted
     */
    constructor(data) {
        this._data = data || [];
    }
    /**
     * @param {number} index 0-based row index
     * @returns {Array<any>} the row at the index
     */
    row(index) {
        return this._data[index];
    }
    /**
     * @param {number} index 0-based column index
     * @returns {Array<any>} the cell at the index of every row
     */
    column(index) {
        return this._data.map(cells => cells[index]);
    }
    /**
     * @returns {Array<any>} all cells in a single flat array, row after row
     */
    squash() {
        return this._data.reduce((p, c) => p.concat(c), []);
    }
    /**
     * The underlying rows (not a copy).
     */
    get data() {
        return this._data;
    }
    /**
     * Format the table as CSV: every cell is trimmed and double-quoted (inner
     * quotes doubled), every row ends with "\n".
     * Cells that are null / undefined are written as empty; other non-string
     * cells are converted with String().
     * @param {number} rowsCount if greater than 0, only the first rowsCount rows are included
     * @returns {string} the CSV text
     */
    toString(rowsCount) {
        const rows = rowsCount > 0 ? this._data.slice(0, rowsCount) : this._data;
        const quote = cell => {
            // Extraction may yield missing or non-string cells; trim() only exists on strings.
            const text = cell === undefined || cell === null ? "" : String(cell);
            return '"' + text.trim().replace(/"/g, '""') + '"';
        };
        return rows
            .map(cells => cells.map(quote).join(",") + "\n")
            .join("");
    }
}

View File

@ -0,0 +1,68 @@
/**
 * Usage / help text of the Extractor API.
 * It is embedded in the messages reported for invalid getData() / Extractor.task() arguments.
 */
const signitures = `
## Usage
new Extractor().task(...args).task(...args).start();
## Extractor.task() Signitures:
function(itemsSelector:string, fieldSelectors:string[])
function(itemsSelector:string, fieldSelectors:string[], url:string, from:number, to:number, interval:number)
function(itemsSelector:string, fieldSelectors:string[], url:string, pages:number[])
function(itemsSelector:string, fieldSelectors:string[], urls:string[])
## Example:
// extract all links text & url under '.item' elements
// use 'selector@attr' to get attribute of the field elements
new Extractor().task(".item", ["a", "a@href"]).start();
## See Detailed Help:
https://git.jebbs.co/jebbs/data-extracter-extesion
`.trim();
/**
 * Check whether the arguments match one of the supported call signatures:
 *   (itemsSelector:string, fieldSelectors:string[])
 *   (itemsSelector:string, fieldSelectors:string[], urls:string[])
 *   (itemsSelector:string, fieldSelectors:string[], urls:ExtractResult)
 *   (itemsSelector:string, fieldSelectors:string[], url:string, pages:number[])
 *   (itemsSelector:string, fieldSelectors:string[], url:string, from:number, to:number, interval:number)
 * itemsSelector must not be empty. from / to / interval must be finite numbers
 * and interval must be greater than 0, otherwise the page range could not be
 * enumerated.
 * @param {...any} args the arguments to check
 * @returns {boolean} true if the arguments are valid
 */
function testArgs(...args) {
    const isStringArray = v => Array.isArray(v) && v.every(s => typeof s === "string");
    const isNumberArray = v => Array.isArray(v) && v.every(n => typeof n === "number");
    const isFiniteNumber = v => typeof v === "number" && Number.isFinite(v);

    const [itemsSelector, fieldSelectors, ...target] = args;
    // Common part of every signature; this also rejects calls with fewer than 2 arguments.
    if (typeof itemsSelector !== "string" || !itemsSelector || !isStringArray(fieldSelectors)) {
        return false;
    }
    switch (target.length) {
        case 0:
            return true;
        case 1:
            return isStringArray(target[0]) || target[0] instanceof ExtractResult;
        case 2:
            return typeof target[0] === "string" && isNumberArray(target[1]);
        case 4:
            return typeof target[0] === "string" &&
                isFiniteNumber(target[1]) &&
                isFiniteNumber(target[2]) &&
                isFiniteNumber(target[3]) &&
                target[3] > 0;
        default:
            return false;
    }
}
/**
 * Format an argument list for display: arrays are written as "[a, b]",
 * everything else is converted with String(), items are separated by ", ".
 * @param {...any} args the arguments to format; null and undefined are allowed
 * @returns {string} the formatted list
 */
function argsToString(...args) {
    // String() instead of .toString(): this is used to report invalid arguments,
    // which may well be null or undefined.
    return args
        .map(v => (Array.isArray(v) ? `[${v.join(', ')}]` : String(v)))
        .join(', ');
}