Compare commits

...

6 Commits

Author SHA1 Message Date
3d375261df fix task._data_keys not cleaned 2020-01-13 16:55:40 +08:00
13e233fbe7 allow user decision when some action fails 2020-01-13 16:47:52 +08:00
21d3dfb247 small fixes 2020-01-13 16:45:54 +08:00
97c8aac58d add logger 2020-01-13 14:27:40 +08:00
09112bb506 update documents 2020-01-12 16:54:24 +08:00
c7f4fe7cc4 save and load state 2020-01-12 16:19:38 +08:00
13 changed files with 312 additions and 125 deletions

View File

@ -1,7 +1,7 @@
{
"manifest_version": 2,
"name": "Data Extracter",
"version": "0.1.0",
"version": "0.5.0",
"author": "jebbs",
"description": "Extract data from web page elements as sheet.",
"icons": {
@ -18,6 +18,7 @@
"scripts": [
"scripts/shared/tools.js",
"scripts/shared/common.js",
"scripts/background/logger.js",
"scripts/background/messaging.js",
"scripts/background/result.js",
"scripts/background/signiture.js",
@ -38,6 +39,7 @@
"run_at": "document_idle"
}],
"permissions": [
"activeTab"
"activeTab",
"notifications"
]
}

View File

@ -3,6 +3,7 @@
<link>
<meta charset="utf-8">
<title>Data Extractor</title>
<script charset="UTF-8" type="text/javascript" src="../scripts/shared/common.js"></script>
<script charset="UTF-8" type="text/javascript" src="tip.js"></script>
<link rel="stylesheet" href="styles/bootstrap.min.css">
@ -32,7 +33,6 @@
</div>
</div>
<div class="row">
<div class="col">
<h6>Quick Start</h6>
</div>
@ -42,22 +42,33 @@
<div class="alert alert-success small">
<p>
<b>Extract current page</b>:
<br>new Extractor().task(".list-item", ["a.title", "p.content"]).start();
<br>&gt; $(".list-item", ["a.title", "p.content"]);
</p>
<p>
<b>Extract multiple pages (1-10, interval 1)</b>:
<br>new Extractor().task(".list-item", ["a.title", "p.content"],
"http://sample.com/?pn=${page}", 1, 10, 1).start();
<br>&gt; job=new Extractor().task(".list-item", ["a.title", "p.content"],
"http://sample.com/?pn=${page}", 1, 10, 1);
<br>&gt; job.start();
</p>
<p>
<b>Full document:</b>
<b>Full document at:</b>
<br>
<a href="#" id="link-document">https://git.jebbs.co/jebbs/data-extracter-extesion</a>
</p>
</div>
</div>
</div>
<div class="row">
<div class="col">
<h6>Saved State</h6>
</div>
</div>
<div class="row">
<div class="col">
<input type="file" name="state" id="state-input">
</div>
</div>
</div>
</body>

View File

@ -11,4 +11,22 @@ window.onload = function () {
'url': `https://git.jebbs.co/jebbs/data-extracter-extesion`
});
})
document.querySelector('#state-input')
.addEventListener('change', function (...args) {
if (this.files.length == 1) {
var reader = new FileReader();
let fileName = this.files[0].name;
reader.readAsText(this.files[0], "UTF-8");
reader.onload = function (evt) {
var fileString = evt.target.result;
chrome.runtime.sendMessage({
action: ACTION_UPLOAD_STATE,
state: fileString,
name: fileName
}, r => {
if (r) console.log('State sent:', r);
});
}
}
});
}

View File

@ -78,8 +78,54 @@ e.task('.search-list-item', ['a@href'], ["http://sample.com/abc"])
.start();
```
### Extractor Options
Specify extra options to make a task perform some actions before scraping the data.
```js
var job = new Extractor({ "scrollToBottom": 1 });
```
Available options:
- `scrollToBottom`: Try to scroll pages to the bottom, since some elements are loaded only when the user scrolls to them.
### Export Result of Any Task
To a multiple task Extractor `e`:
```js
e = new Extractor()
e.task('.search-list-item', ['a@href'], ["http://sample.com/abc"])
.task('list-item', ["a.title", "p.content"])
.start();
```
User will be asked to export the final result when it finishes.
In case you want to export it again, use:
```js
e.export()
```
To export another task result, other than the final one:
```js
// export the result of first task
// to the example above, that is a list of urls
e.export(0)
// export the result of second task
e.export(1)
```
## Task Management
### Continue Tasks
Sometimes it's hard to finish all tasks in a single execution; that's why we need "Continuing of Tasks".
You can always continue tasks (as follows), even if execution stopped in the middle of a task:
```js
@ -99,9 +145,11 @@ e.restart(0)
e.restart(1)
```
### Save Result of Any Task
### Save & Load State
To a multiple task Extractor `e`:
It may also be hard to finish the tasks even in a single day, so we need a way to save the current state and come back tomorrow.
Create and run an extractor:
```js
e = new Extractor()
@ -110,20 +158,16 @@ e.task('.search-list-item', ['a@href'], ["http://sample.com/abc"])
.start();
```
User will be asked to save the final result when it finishes.
Incase you want to save it again, use:
Save the state:
```js
e.save()
e.save();
```
To save another task result, other than the final one:
Load the state:
Open the popup window and upload the saved state file. Then, in the background console:
```js
// save the result of first task
// to the example above, that is a list of urls
e.save(0)
// save the result of second task
e.save(1)
```
e = new Extractor().load();
```

View File

@ -1,54 +1,3 @@
/**
* Extract data from current page / multiple urls.
* getData(tab, itemsSelector:string, fieldSelectors:string[])
* getData(tab, itemsSelector:string, fieldSelectors:string[], url:string, from:number, to:number, interval:number)
* getData(tab, itemsSelector:string, fieldSelectors:string, url:string, pages:number[])
* getData(tab, itemsSelector:string, fieldSelectors:string[], urls:string[])
* getData(tab, itemsSelector:string, fieldSelectors:string[], urls:ExtractResult)
* getData(itemsSelector:string, fieldSelectors:string[])
* getData(itemsSelector:string, fieldSelectors:string[], url:string, from:number, to:number, interval:number)
* getData(itemsSelector:string, fieldSelectors:string, url:string, pages:number[])
* getData(itemsSelector:string, fieldSelectors:string[], urls:string[])
* getData(itemsSelector:string, fieldSelectors:string[], urls:ExtractResult)
* @param {...any} args
*/
async function getData(...args) {
let tab;
if (typeof args[0] !== 'string') tab = args.shift();
if (!testArgs(...args))
throw new Error(`Invalid call arguments.\n\n${signitures}\n\n`);
itemsSelector = args.shift();
fieldSelectors = args.shift();
let urls = parseUrls(...args);
let data = [];
if (!tab) tab = await getActiveTab(true) || await getActiveTab(false);
if (!tab) throw new Error("Cannot find active tab.");
return new Promise((resolve, reject) => {
let pms;
if (urls.length) {
pms = urls.reduce((p, url) => p.then(
results => {
if (results) data.push(...results);
return redirectTab(tab, url).then(
() => extractTabData(tab, itemsSelector, fieldSelectors)
);
},
() => p
), Promise.resolve([]));
} else {
pms = extractTabData(tab, itemsSelector, fieldSelectors);
}
pms.then(
results => {
if (results) data.push(...results);
data.unshift(fieldSelectors);
resolve(new ExtractResult(data));
},
err => reject(err)
);
});
}
function parseUrls(...args) {
if (!args.length) return [];
let arg = args.shift();
@ -77,19 +26,26 @@ function parseUrls(...args) {
}
function redirectTab(tab, url) {
let curUrl = "";
return queryUrl(tab, undefined, 'Query current url...')
.then(u => {
if (url !== u) {
curUrl = u;
let req = {
action: ACTION_GOTO_URL,
url: url
}
return sendMessage(tab, req, `Goto url: ${url}`);
return queryUrl(tab).then(u => {
if (url !== u) {
let req = {
action: ACTION_GOTO_URL,
url: url
}
})
.then(() => queryUrl(tab, url, 'Check if tab url matches expected...'))
let checker = async (url, err, tryCount) => {
let newURL = await queryUrl(tab).catch(() => { });
if (newURL == url) return url;
if (
tryCount % 5 == 0 &&
!confirm('Cannot navigate to target url. \nPress OK to continue, Cancel to stop.')
) {
return MSG_USER_ABORT;
}
return undefined;
}
return sendMessage(tab, req, `Goto url: ${url}`, checker);
}
});
}
/**
@ -105,8 +61,19 @@ function extractTabData(tab, itemsSelector, fieldSelectors) {
itemsSelector: itemsSelector,
fieldSelectors: fieldSelectors
}
let cond = r => !MSG_ELEMENT_NOT_FOUND.isEqual(r);
return sendMessage(tab, req, 'Extract data from the tab...', cond);
let checker = (result, err, tryCount) => {
if (MSG_ELEMENT_NOT_FOUND.isEqual(result)) {
if (tryCount % 20 == 0) {
if (confirm('No data found in current page. \n\nContinue to next page?')) {
return [];
}
} else {
return undefined;
}
}
return result;
};
return sendMessage(tab, req, 'Extract data from the tab...', checker);
}
/**
@ -118,23 +85,21 @@ async function ping(tab, count = 1) {
let req = {
action: ACTION_REPORT_IN
}
let cond = r => r == req.action;
let pong = await sendMessage(tab, req, 'Check tab availability...', cond, 1000, count).catch(() => { });
let checker = r => r == req.action ? req.action : undefined;
let pong = await sendMessage(tab, req, 'Check tab availability...', checker, 1000, count).catch(() => { });
return pong == ACTION_REPORT_IN;
}
/**
* get the url of the target tab
* @param {any} tab target tab
* @param {string} expected if specified, queryUrl resolves only when tab url equals to expected
* @returns {Promise<string>} a promise of the url
*/
function queryUrl(tab, expected, log) {
function queryUrl(tab) {
let req = {
action: ACTION_QUERY_URL
}
let cond = url => url && (!expected || (expected && expected == url));
return sendMessage(tab, req, log, cond);
return sendMessage(tab, req);
}
/**

View File

@ -1,9 +1,31 @@
// Raw JSON text of a saved state uploaded from the popup window.
// Extractor.load() reads it and resets it to "" once it has been parsed.
var __EXTRACTOR_STATE__ = "";
class Extractor {
constructor(options) {
this._tasks = [];
this._running = false;
this._options = options;
}
/**
* Save current state, in case we restore it later.
*/
save() {
saveFile(JSON.stringify(this), 'application/json', 'state.json');
}
/**
* Restore previous state by loading from saved state.
*/
load() {
if (!__EXTRACTOR_STATE__) {
logger.info('No state found. Please upload a saved state from the popup window first.');
return;
}
let state = JSON.parse(__EXTRACTOR_STATE__);
__EXTRACTOR_STATE__ = "";
this._options = state._options;
this._tasks = state._tasks.map(t => new Task(this._options, 'whaterver', ['whaterver']).load(t));
return this;
}
/**
* Add a task to Extractor. \n
* One Extractor could has multiple tasks, which orgnized in a task chian.
@ -41,11 +63,11 @@ class Extractor {
}
async _startTasks(from) {
if (this._running) {
console.log('The Extractor is running. Please wait..');
logger.info('The Extractor is running. Please wait..');
return;
}
if (!this._tasks.length) {
console.log('No task to run.');
logger.info('No task to run.');
return;
}
@ -58,7 +80,7 @@ class Extractor {
tab = await getActiveTab(true) || await getActiveTab(false);
let succ = await ping(tab);
if (!succ) {
console.log('Cannot contact with active tab.');
logger.error('Cannot contact with active tab.');
return;
}
}
@ -76,29 +98,27 @@ class Extractor {
}, Promise.resolve(undefined)).then(
() => {
this._running = false;
this.save();
this.export();
}
).catch(err => {
this._running = false;
console.log(err);
logger.error(err);
});
}
/**
* Save result of a task
* export result of a task to CSV
* @param {number} taskid which task id to save, begins with 0
*/
save(taskid) {
export(taskid) {
let id = this._checkTaskId(taskid, this._tasks.length - 1);
if (id < 0) return;
let results = this._tasks[id].results
results.unshift(this._tasks[id].fieldSelectors);
let exResults = new ExtractResult(results);
if (!results.length) {
console.log(`No result for task #${id}. Forget to call ".start()"?`);
logger.info(`No result for task #${id}. Forget to call ".start()"?`);
return;
}
results.unshift(this._tasks[id].fieldSelectors);
let exResults = new ExtractResult(results);
let msg = `
Please confirm to download (${results.length - 1} items)
@ -110,12 +130,12 @@ ${exResults.toString(50) || "- Empty -"}
}
_checkTaskId(id, defaultId) {
if (!this._tasks.length) {
console.log("No task found.");
logger.info("No task found.");
return -1;
}
if (!isNaN(defaultId) && id === undefined) id = defaultId;
if (isNaN(id) || id < 0 || id >= this._tasks.length) {
console.log(`Invalid task id. Rang(0-${this._tasks.length - 1})`);
logger.info(`Invalid task id. Rang(0-${this._tasks.length - 1})`);
return -1;
}
return id

View File

@ -0,0 +1,81 @@
/**
 * Severity levels understood by Logger, ordered from least to most severe.
 * `properties` maps each numeric level to its display metadata; `prefix` is
 * the tag Logger.log prints in front of every message.
 * DISABLED has no entry in `properties`; it is used as a threshold that no
 * real level reaches.
 */
const LOGGER_LEVEL = {
    DEBUG: 1,
    INFO: 2,
    WARNING: 3,
    ERROR: 4,
    DISABLED: 100,
    properties: {
        1: { name: "debug", value: 1, prefix: "DEBUG" },
        2: { name: "info", value: 2, prefix: "INFO" },
        3: { name: "warning", value: 3, prefix: "WARN" },
        // value mirrors the level constant (it used to be 3, colliding with "warning")
        4: { name: "error", value: 4, prefix: "ERROR" }
    }
};
/**
 * Leveled console logger that can also mirror messages to a single
 * desktop notification (created once, then updated in place).
 */
class Logger {
    // id of the notification currently shown; undefined when there is none
    _notificationId = undefined;
    // messages below this level are dropped entirely
    _log_level = LOGGER_LEVEL.INFO;
    // logged messages at or above this level are also sent to notify()
    _notify_level = LOGGER_LEVEL.ERROR;

    /**
     * @param {number} logLevel minimum level written to the console, default: INFO
     * @param {number} notifyLevel minimum level shown as a notification, default: ERROR
     */
    constructor(logLevel, notifyLevel) {
        if (logLevel) this._log_level = logLevel;
        if (notifyLevel) this._notify_level = notifyLevel;
        // Once the user closes our notification its id is dead: forget it so the
        // next notify() creates a fresh one. The notify level must stay untouched.
        chrome.notifications.onClosed.addListener((id, byUser) => {
            if (id === this._notificationId) this._notificationId = undefined;
        });
    }

    get logLevel() {
        return this._log_level;
    }
    set logLevel(val) {
        this._log_level = val;
    }
    get notifyLevel() {
        return this._notify_level;
    }
    set notifyLevel(val) {
        this._notify_level = val;
    }

    /**
     * Write a message to the console and, if severe enough, to the notification.
     * @param {number} level one of the LOGGER_LEVEL values that has an entry in LOGGER_LEVEL.properties
     * @param {function} loggerFn console function used for the output
     * @param {...any} msgs message parts, passed through to loggerFn
     */
    log(level, loggerFn, ...msgs) {
        if (level < this._log_level) return;
        let time = new Date().toLocaleString();
        loggerFn(`${time} [${LOGGER_LEVEL.properties[level].prefix}]`, ...msgs);
        if (level < this._notify_level) return;
        this.notify(...msgs);
    }

    debug(...msgs) {
        this.log(LOGGER_LEVEL.DEBUG, console.debug, ...msgs);
    }
    info(...msgs) {
        this.log(LOGGER_LEVEL.INFO, console.info, ...msgs);
    }
    warn(...msgs) {
        // use the matching console channel so warnings are filterable as such
        this.log(LOGGER_LEVEL.WARNING, console.warn, ...msgs);
    }
    error(...msgs) {
        this.log(LOGGER_LEVEL.ERROR, console.error, ...msgs);
    }

    /**
     * Show the message parts (joined by a space) in the extension notification.
     * Creates the notification on first use and updates its text afterwards.
     * @param {...any} msgs message parts
     */
    notify(...msgs) {
        let msg = msgs.join(' ');
        if (!this._notificationId) {
            chrome.notifications.create(
                null,
                {
                    "type": "basic",
                    "iconUrl": chrome.extension.getURL('icon.png'),
                    "title": "Data Extractor",
                    "message": msg,
                    "priority": 0,
                    "requireInteraction": true,
                },
                notificationId => {
                    this._notificationId = notificationId;
                }
            );
            return;
        }
        chrome.notifications.update(
            this._notificationId,
            { "message": msg }
        );
    }
}
// Shared logger instance: everything from DEBUG upwards is written to the
// console; the notify threshold is DISABLED, which no message level reaches.
const logger = new Logger(LOGGER_LEVEL.DEBUG, LOGGER_LEVEL.DISABLED);

View File

@ -1,24 +1,27 @@
/**
* Repeatedly sending a message to target tab until the response is detected good.
* Sending a message to target tab repeatedly until the response is not undefined.
* @param {object} tab the table where to send the message
* @param {object} req the request data.
* @param {function} cond success condition function, r:any=>boolean
* @param {function} dataChecker (result:any, err:error, tryCount:number) => any.
* Check and decide what value finally returns.
* Return undefined to make 'sendMessage' retry.
* Return MSG_USER_ABORT to cancel this promise.
* @param {number} interval retry interval, default: 500ms.
* @param {number} limit retry limit, default: 0, no limit.
* @param {string} log messages logged to console.
* @return {Promise} a promise of the response.
*/
function sendMessage(tab, req, log, cond, interval, limit = 0) {
function sendMessage(tab, req, log, dataChecker, interval, limit = 0) {
interval = interval || 500;
limit = limit && !isNaN(limit) ? limit : 0;
count = 0;
let count = 0;
return new Promise((resolve, reject) => {
loop();
async function loop() {
// console.log("request for", req.action);
logger.debug("Request for", req.action);
let tabAvailable = await getTabByID(tab.id);
if (!tabAvailable) {
reject("Task interrupted due to the target tab is closed.");
@ -30,16 +33,22 @@ function sendMessage(tab, req, log, cond, interval, limit = 0) {
return;
}
count++;
chrome.tabs.sendMessage(tab.id, req, r => {
chrome.tabs.sendMessage(tab.id, req, async r => {
// check error but do nothing.
// do not interrupt promise chains even if error, or the task always fail when:
// a tab is newly created, and the content scripts won't have time to initialize
chrome.runtime.lastError;
let flag = !cond || cond(r);
if (log) console.log(log, flag ? '(OK)' : '(failed)');
let err = chrome.runtime.lastError;
let result = r;
if (dataChecker) {
result = await dataChecker(r, err, count);
if (MSG_USER_ABORT.isEqual(result)) {
reject(MSG_USER_ABORT.message);
}
}
let flag = result !== undefined && result !== null;
if (log) logger.info(log, flag ? '(OK)' : '(failed)');
if (flag) {
resolve(r);
resolve(result);
} else {
setTimeout(() => {
loop();
@ -50,10 +59,18 @@ function sendMessage(tab, req, log, cond, interval, limit = 0) {
});
}
chrome.runtime.onMessage.addListener(function (message, sender, sendResponse) {
if (!message.action || !message.action.startsWith(EXT_NAME)) {
chrome.runtime.onMessage.addListener(function (request, sender, sendResponse) {
if (!request.action || !request.action.startsWith(EXT_NAME)) {
return;
}
sendResponse("Calling from user pages is not allowed.");
return;
switch (request.action) {
case ACTION_UPLOAD_STATE:
sendResponse('recieved!');
__EXTRACTOR_STATE__ = request.state;
logger.info(`State (${request.name}) recieved. To load it: some_var = new Extractor().load()`);
break;
default:
sendResponse("Request not supported.");
break;
}
});

View File

@ -21,6 +21,9 @@ class ExtractResult {
let data = rowsCount > 0 ? this._data.slice(0, rowsCount) : this._data;
return data.slice().reduce(
(csv, lineCells) => {
if (!lineCells || !lineCells.length) {
return csv + "\n";
}
let line = lineCells.reduce(
(lineText, cell, idx) => {
cell = '"' + cell.trim().replace(/"/g, '""') + '"';

View File

@ -17,6 +17,15 @@ class Task {
this._fieldSelectors = args.shift();
this._urls = parseUrls(...args);
}
load(state) {
this._itemsSelector = state._itemsSelector;
this._data = state._data;
this._data_keys = state._data_keys;
this._itemsSelector = state._itemsSelector;
this._fieldSelectors = state._fieldSelectors;
this._urls = state._urls;
return this;
}
get urls() {
return this._urls;
}
@ -33,6 +42,7 @@ class Task {
}
clean() {
this._data = {};
this._data_keys = [];
}
async execute(tab, upstreamData) {
if (!tab) return Promise.reject("No tab to execute the task.");

View File

@ -1,7 +1,12 @@
(function () {
let asleep = false;
chrome.runtime.onMessage.addListener(
function (request, sender, sendResponse) {
if (!request.action) return;
if (asleep && ACTION_WAKEUP != request.action) {
sendResponse && sendResponse(undefined);
return;
}
// console.log("Recieved request:",request);
doAction(request, sender).then(r => sendResponse && sendResponse(r));
// return true to indicate you wish to send a response asynchronously
@ -16,6 +21,8 @@
return data;
case ACTION_GOTO_URL:
window.location.replace(request.url);
// should not recieve any request until the page & script reload
asleep = true;
return request.url;
case ACTION_REPORT_IN:
return request.action;
@ -29,6 +36,12 @@
1000,
10
)
case ACTION_SLEEP:
asleep = true;
return "Content script is sleeping.";
case ACTION_WAKEUP:
asleep = false;
return "Content script is available.";
default:
break;
}

View File

@ -1,12 +1,10 @@
const EXT_NAME = "DataExtracter";
const URL_REG = getWebUrl();
const ACTION_EXTRACT = `${EXT_NAME}:Extract`;
const ACTION_GOTO_URL = `${EXT_NAME}:GoToTUL`;
const ACTION_REPORT_IN = `${EXT_NAME}:ReportIn`;
const ACTION_QUERY_URL = `${EXT_NAME}:QueryURL`;
const ACTION_SCROLL_BOTTOM = `${EXT_NAME}:ScrollToBottom`;
const MSG_ELEMENT_NOT_FOUND = new ConstMessage(1, "No element found for at least one selector, maybe it's not loaded yet");
const MSG_URL_SKIPPED = new ConstMessage(100, "Skipped current URL");
const ACTION_UPLOAD_STATE = `${EXT_NAME}:UploadStateFile`;
const ACTION_SLEEP = `${EXT_NAME}:Sleep`;
const ACTION_WAKEUP = `${EXT_NAME}:WakeUp`;

View File

@ -9,6 +9,11 @@ class ConstMessage {
}
}
// Shared constants: URL_REG is whatever getWebUrl() returns; the MSG_*
// values are ConstMessage instances passed around as sentinel results.
const URL_REG = getWebUrl();
const MSG_ELEMENT_NOT_FOUND = new ConstMessage(1, "No element found for at least one selector, maybe it's not loaded yet");
const MSG_URL_SKIPPED = new ConstMessage(100, "Skipped current URL");
// NOTE(review): same first argument (100) as MSG_URL_SKIPPED. If ConstMessage.isEqual
// compares by that value, the two messages are indistinguishable - confirm.
const MSG_USER_ABORT = new ConstMessage(100, "Tasks stopped by user.");
function saveFile(data, mimeType, fileName) {
fileName = fileName || document.title || "result";
var blob;