Compare commits

...

16 Commits

Author SHA1 Message Date
e87e7010ec improvements
* chance to continue on mismatch url for redirectTab
* support empty field selectors
* add Extractor.results()
* add ExtractResult.walk(), ExtractResult.visit()
* add ! directive to click elements
* code optimize
2021-04-20 14:20:05 +08:00
108ebb835f fix task running state 2021-04-20 12:01:10 +08:00
e0b0a5e986 add timeout for messaging 2021-04-20 12:00:59 +08:00
9cd25e3c1d update url 2021-04-19 15:58:04 +08:00
7827d385bd refactor 2020-06-16 14:45:36 +08:00
ade0670415 update readme 2020-01-17 11:01:13 +08:00
63aec616b1 code optimize 2020-01-17 09:38:40 +08:00
378883b626 check url change before extract data 2020-01-16 15:11:49 +08:00
c78f593c70 code optimize 2020-01-16 09:59:19 +08:00
d82010686d Extractor.watch() improvements
- only watch current window
- stop watch on window close
- don't ask user to confirm when fails
2020-01-15 18:28:28 +08:00
7644a1363f Extractor.watch() 2020-01-15 17:53:23 +08:00
3338f78d91 code optimize 2020-01-15 15:21:17 +08:00
da7ae057f4 Extractor.stop() 2020-01-15 14:18:31 +08:00
2224db1ad1 incognito window first 2020-01-15 14:05:57 +08:00
790c95ffc3 clean state cache in 30 seconds 2020-01-14 17:03:14 +08:00
f06a6f4e78 migrate to typescript, with fixes 2020-01-14 16:37:50 +08:00
35 changed files with 5792 additions and 666 deletions

154
.gitignore vendored
View File

@ -1,2 +1,154 @@
# File created using '.gitignore Generator' for Visual Studio Code: https://bit.ly/vscode-gig
# Created by https://www.gitignore.io/api/visualstudiocode,macos,node
# Edit at https://www.gitignore.io/?templates=visualstudiocode,macos,node
### macOS ###
# General
.DS_Store
Thumbs.db
.AppleDouble
.LSOverride
# Icon must end with two \r
Icon
# Thumbnails
._*
# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent
# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk
### Node ###
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
lerna-debug.log*
# Diagnostic reports (https://nodejs.org/api/report.html)
report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
# Runtime data
pids
*.pid
*.seed
*.pid.lock
# Directory for instrumented libs generated by jscoverage/JSCover
lib-cov
# Coverage directory used by tools like istanbul
coverage
*.lcov
# nyc test coverage
.nyc_output
# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
.grunt
# Bower dependency directory (https://bower.io/)
bower_components
# node-waf configuration
.lock-wscript
# Compiled binary addons (https://nodejs.org/api/addons.html)
build/Release
# Dependency directories
node_modules/
jspm_packages/
# TypeScript v1 declaration files
typings/
# TypeScript cache
*.tsbuildinfo
# Optional npm cache directory
.npm
# Optional eslint cache
.eslintcache
# Optional REPL history
.node_repl_history
# Output of 'npm pack'
*.tgz
# Yarn Integrity file
.yarn-integrity
# dotenv environment variables file
.env
.env.test
# parcel-bundler cache (https://parceljs.org/)
.cache
# next.js build output
.next
# nuxt.js build output
.nuxt
# rollup.js default build output
dist/
# Uncomment the public line if your project uses Gatsby
# https://nextjs.org/blog/next-9-1#public-directory-support
# https://create-react-app.dev/docs/using-the-public-folder/#docsNav
# public
# Storybook build outputs
.out
.storybook-out
# vuepress build output
.vuepress/dist
# Serverless directories
.serverless/
# FuseBox cache
.fusebox/
# DynamoDB Local files
.dynamodb/
# Temporary folders
tmp/
temp/
### VisualStudioCode ###
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
### VisualStudioCode Patch ###
# Ignore all local history of files
.history
# End of https://www.gitignore.io/api/visualstudiocode,macos,node
# Custom rules (everything added below won't be overridden by 'Generate .gitignore File' if you use 'Update' option)

Binary file not shown.

Before

Width:  |  Height:  |  Size: 24 KiB

View File

@ -1,45 +0,0 @@
{
"manifest_version": 2,
"name": "Data Extracter",
"version": "0.5.0",
"author": "jebbs",
"description": "Extract data from web page elements as sheet.",
"icons": {
"16": "icon.png",
"48": "icon.png",
"128": "icon.png"
},
"browser_action": {
"default_icon": "icon.png",
"default_popup": "popup/tip.html",
"default_title": "Data Extracter"
},
"background": {
"scripts": [
"scripts/shared/tools.js",
"scripts/shared/common.js",
"scripts/background/logger.js",
"scripts/background/messaging.js",
"scripts/background/result.js",
"scripts/background/signiture.js",
"scripts/background/actions.js",
"scripts/background/task.js",
"scripts/background/extractor.js",
"scripts/background/helpers.js"
],
"persistent": false
},
"content_scripts": [{
"matches": ["*://*/*"],
"js": [
"scripts/shared/tools.js",
"scripts/shared/common.js",
"scripts/content/content.js"
],
"run_at": "document_idle"
}],
"permissions": [
"activeTab",
"notifications"
]
}

4433
package-lock.json generated Normal file

File diff suppressed because it is too large Load Diff

17
package.json Normal file
View File

@ -0,0 +1,17 @@
{
"name": "data-extractor",
"scripts": {
"dev": "webpack --mode=development --devtool=inline-source-map --watch",
"prod": "webpack --mode=production"
},
"devDependencies": {
"@types/chrome": "0.0.91",
"@types/node": "^13.1.6",
"copy-webpack-plugin": "^5.1.1",
"ts-loader": "^6.2.1",
"tslint": "^5.20.1",
"typescript": "^3.7.4",
"webpack": "^4.41.5",
"webpack-cli": "^3.3.10"
}
}

View File

@ -8,15 +8,21 @@ All you need to do is:
- Find out the selectors for target data
- Type scripts in the console of the `extension background page`, as introduced below.
![](images/console.png)
![](template/assets/console.png)
## Quick Start
Extract current page
```js
$('.item', ['a', 'a@href']);
new Extractor().task('.item', ['a', 'a@href']).start();
// fieldSelectors can be empty strings if items have no child to select
new Extractor().task('.item a', ['', '@href']).start();
```
> `$(...args)` is the short form of `new Extractor().task(...args).start();`, which is introduced later.
Extract multiple pages (1-10, interval 1)
```js
@ -52,12 +58,20 @@ function (itemsSelector:string, fieldSelectors:string[], urls:ExtractResult)
## Stop Tasks
The only way to stop tasks before its finish, is `Closing the target tab`.
Close the target tab in which the current tasks are running.
> Tasks wait for their target elements' appearance, given some elements were loaded asynchronously.
> If you typed wrong selectors, the task waits forever for elements which don't exist.
Or use `job.stop()`:
## Extract Attributes.
```js
job = new Extractor().task('.search-list-item', ['a@href'], ["http://sample.com/abc"])
.task('list-item', ["a.title", "p.content"])
.start();
job.stop();
```
> Next time you call `job.start();`, the job will continue from where it stopped.
## Extract Attributes
e.g.: link text and target (use 'selector@attribute')
@ -65,6 +79,14 @@ e.g.: link text and target (use 'selector@attribute')
new Extractor().task('.item', ['a', 'a@href']).start();
```
## Click Selected Elements
The following clicks selected links and extracts link `text` and `href`
```js
new Extractor().task('.item', ['!a', 'a@href']).start();
```
## Advanced Usage
### Use Task Chain.
@ -126,17 +148,17 @@ e.export(1)
Sometimes, it's hard to finish them in a single execution; that's why we need "Continuing of Tasks".
You can always continue tasks (with following), even it stops in the middle of a task:
You can always continue tasks by starting them again, no matter in what phase they stopped.
```js
e.start()
```
The `Extractor` kept the state of last execution, and starts from where it stopped.
The `Extractor` keeps the execution state, and starts from where it stopped.
### Restart Tasks
What should I do, if I don't like to continue from last state, but restart from certain task?
What if I don't like to continue from last state, but restart certain tasks?
```js
// restart all tasks
@ -166,8 +188,59 @@ e.save();
Load the state:
Open the popup window, upload the saved state file. Then, and in the backgoud console:
Open the popup window and upload the saved state file. Then, in the background console:
```js
e = new Extractor().load();
e.start();
```
> The uploaded state will be cleaned in 30 seconds, if you don't load it.
## Watch Mode
Watch mode tries to extract data from every page you visit **in the current window**.
```js
e = new Extractor();
e.task('.search-list-item', ['a@href'], ["http://sample.com/abc"])
.task('list-item', ["a.title", "p.content"]);
e.watch(1); // start watching for first task
```
To stop watching, you can either `close current window`, or:
```js
e.stop();
```
## Results Operation
To get the results of a task:
```js
let results = job.results(0);
```
Visit URLs (if any) in the results one by one:
```js
results.visit();
```
Walk through all results one by one:
```js
results.walk((row,col,value)=>{console.log(value)});
```
## Development
Clone this project and execute:
```sh
npm i
npm run prod
# or
npm run dev
```

View File

@ -1,147 +0,0 @@
/**
 * Build the list of target URLs from the URL-related arguments of a task.
 *
 * Supported call shapes:
 *   parseUrls()                                  -> []
 *   parseUrls(urls: string[])                    -> the array itself
 *   parseUrls(result: ExtractResult)             -> every squashed value matching URL_REG
 *   parseUrls(template: string, pages: any[])    -> template with "${page}" filled per page
 *   parseUrls(template, from, to, interval)      -> template filled for from, from+interval, ... <= to
 *
 * Any other combination yields an empty array.
 *
 * @param {...any} args see the call shapes above
 * @returns {string[]} the target URLs
 * @throws {Error} when a numeric range is requested with an interval <= 0,
 *                 which previously made the range loop run forever
 */
function parseUrls(...args) {
    if (!args.length) return [];
    const [first, ...rest] = args;
    if (first instanceof Array) return first;
    if (first instanceof ExtractResult) {
        return first.squash().filter(v => URL_REG.test(v));
    }
    // Remaining shapes need a non-empty URL template.
    if (!first) return [];

    // split/join replaces every "${page}" occurrence and, unlike
    // String.prototype.replace, never interprets "$&"-style patterns in the page value.
    const fill = page => first.split("${page}").join(page);

    const [pagesOrFrom] = rest;
    if (pagesOrFrom instanceof Array) {
        return pagesOrFrom.map(page => fill(page));
    }
    if (rest.length >= 3) {
        const [from, to, interval] = rest;
        if (interval <= 0) {
            throw new Error(`Invalid page interval: ${interval}. It must be greater than 0.`);
        }
        const urls = [];
        for (let page = from; page <= to; page += interval) {
            urls.push(fill(page));
        }
        return urls;
    }
    return [];
}
/**
 * Redirect the tab to the given URL, unless the tab already reports that URL.
 *
 * After each redirect request the tab URL is queried again. The request is
 * considered done once the tab reports the target URL. On every 5th failed
 * try the user is asked whether to keep trying; cancelling aborts the
 * request with MSG_USER_ABORT.
 *
 * @param {any} tab target tab
 * @param {string} url target URL
 * @returns {Promise<string|undefined>} resolves to undefined when the tab is
 *          already at `url`, otherwise to whatever sendMessage resolves with
 *          (the target URL once the checker accepts it)
 */
function redirectTab(tab, url) {
    return queryUrl(tab).then(current => {
        if (url === current) return undefined;
        const req = {
            action: ACTION_GOTO_URL,
            url: url
        };
        // The first checker argument is the raw response. It must not be named
        // `url`, or it would shadow the target URL that we compare against.
        const checker = async (response, err, tryCount) => {
            const newURL = await queryUrl(tab).catch(() => { });
            if (newURL == url) return url;
            if (
                tryCount % 5 == 0 &&
                !confirm('Cannot navigate to target url. \nPress OK to continue, Cancel to stop.')
            ) {
                return MSG_USER_ABORT;
            }
            // undefined tells sendMessage to retry
            return undefined;
        };
        return sendMessage(tab, req, `Goto url: ${url}`, checker);
    });
}
/**
 * Extract data from the target tab.
 *
 * The request is retried while the content script reports that the elements
 * were not found. On every 20th try the user is asked whether to continue
 * with the next page: OK resolves with an empty result, Cancel aborts the
 * request with MSG_USER_ABORT (previously the not-found marker itself was
 * resolved as if it were the extracted data).
 *
 * @param {any} tab target tab
 * @param {string} itemsSelector items selectors for selecting items (data rows)
 * @param {Array<string>} fieldSelectors fields selectors for selecting fields (data columns) under each item
 * @returns {Promise<string[][]>} a promise of extracted data
 */
function extractTabData(tab, itemsSelector, fieldSelectors) {
    const req = {
        action: ACTION_EXTRACT,
        itemsSelector: itemsSelector,
        fieldSelectors: fieldSelectors
    };
    const checker = (result, err, tryCount) => {
        if (!MSG_ELEMENT_NOT_FOUND.isEqual(result)) return result;
        // undefined tells sendMessage to retry
        if (tryCount % 20 != 0) return undefined;
        return confirm('No data found in current page. \n\nContinue to next page?')
            ? []
            : MSG_USER_ABORT;
    };
    return sendMessage(tab, req, 'Extract data from the tab...', checker);
}
/**
 * Ping the target tab; typically used to find out whether the content
 * script is ready to answer.
 * @param {any} tab target tab
 * @param {number} count passed to sendMessage as its retry limit
 * @returns {Promise<boolean>} resolves to true when the tab echoed the request action
 */
async function ping(tab, count = 1) {
    const request = { action: ACTION_REPORT_IN };
    // Accept only an echo of the action; anything else means "retry".
    const acceptEcho = reply => {
        if (reply == request.action) return request.action;
        return undefined;
    };
    const pending = sendMessage(tab, request, 'Check tab availability...', acceptEcho, 1000, count);
    // A rejected ping simply counts as "not available".
    const reply = await pending.catch(() => { });
    return reply == ACTION_REPORT_IN;
}
/**
 * Ask the target tab for its current URL.
 * @param {any} tab target tab
 * @returns {Promise<string>} a promise of the url
 */
function queryUrl(tab) {
    return sendMessage(tab, { action: ACTION_QUERY_URL });
}
/**
 * Ask the target tab to scroll to the bottom of its page.
 * @param {any} tab target tab
 * @returns {Promise<any>} the promise returned by sendMessage
 */
function scrollToBottom(tab) {
    const request = { action: ACTION_SCROLL_BOTTOM };
    const logLine = 'Scroll to page bottom...';
    return sendMessage(tab, request, logLine);
}
/**
 * Promise wrapper around chrome.tabs.create.
 * @param {string} url url to open in the new tab
 * @param {boolean} active whether the new tab becomes the active one
 * @returns {Promise<any>} resolves with the tab passed to the chrome callback
 */
async function createTab(url, active) {
    const createProperties = { 'url': url, 'active': active };
    return new Promise((resolve, reject) => {
        chrome.tabs.create(createProperties, tab => {
            resolve(tab);
        });
    });
}
/**
 * Promise wrapper around chrome.tabs.query for the active tab.
 * @param {boolean} currentWindow passed through as the `currentWindow` query filter
 * @returns {Promise<any>} resolves with the first matching tab (undefined when none matched)
 */
async function getActiveTab(currentWindow) {
    const queryInfo = { active: true, currentWindow: currentWindow };
    return new Promise((resolve, reject) => {
        chrome.tabs.query(queryInfo, tabs => {
            const firstTab = tabs[0];
            resolve(firstTab);
        });
    });
}
/**
 * Promise wrapper around chrome.tabs.get.
 * @param {number} id id of the tab to look up
 * @returns {Promise<any>} resolves with whatever chrome passes to the callback
 */
async function getTabByID(id) {
    return new Promise((resolve, reject) => {
        const onTab = tab => {
            // Read lastError on purpose and ignore its value.
            void chrome.runtime.lastError;
            resolve(tab);
        };
        chrome.tabs.get(id, onTab);
    });
}

View File

@ -1,3 +0,0 @@
/**
 * Short form of `new Extractor().task(...args).start()`.
 * @param {...any} args forwarded unchanged to Extractor.task
 * @returns {any} whatever `start()` returns
 */
function $(...args) {
    const withTask = new Extractor().task(...args);
    return withTask.start();
}

View File

@ -1,76 +0,0 @@
/**
 * Send a message to the target tab repeatedly until a usable response arrives.
 *
 * A response is usable when it (or the value the checker maps it to) is
 * neither undefined nor null.
 *
 * @param {object} tab the tab to send the message to
 * @param {object} req the request data.
 * @param {string} log message logged to the console after each try.
 * @param {function} dataChecker (result:any, err:error, tryCount:number) => any.
 *      Check and decide what value finally returns.
 *      Return undefined to make 'sendMessage' retry.
 *      Return MSG_USER_ABORT to cancel this promise.
 *      If it throws (or rejects), the promise is rejected with that error.
 * @param {number} interval retry interval, default: 500ms.
 * @param {number} limit retry limit, default: 0, no limit.
 * @return {Promise} a promise of the response. Rejected when the tab is gone,
 *      the retry limit is reached, the checker aborts, or the checker throws.
 */
function sendMessage(tab, req, log, dataChecker, interval, limit = 0) {
    interval = interval || 500;
    limit = limit && !isNaN(limit) ? limit : 0;
    let count = 0;
    return new Promise((resolve, reject) => {
        const loop = async () => {
            logger.debug("Request for", req.action);
            const tabAvailable = await getTabByID(tab.id);
            if (!tabAvailable) {
                reject("Task interrupted due to the target tab is closed.");
                return;
            }
            if (limit && count >= limit) {
                reject(`sendMessage loop limit ${limit} reached.`);
                return;
            }
            count++;
            chrome.tabs.sendMessage(tab.id, req, async r => {
                // check error but do nothing.
                // do not interrupt promise chains even if error, or the task always fail when:
                // a tab is newly created, and the content scripts won't have time to initialize
                const err = chrome.runtime.lastError;
                let result = r;
                if (dataChecker) {
                    try {
                        result = await dataChecker(r, err, count);
                    } catch (checkerError) {
                        // Without this the caller's promise would stay pending forever.
                        reject(checkerError);
                        return;
                    }
                    if (MSG_USER_ABORT.isEqual(result)) {
                        reject(MSG_USER_ABORT.message);
                        // Stop here: an aborted request must not be logged as OK or resolved.
                        return;
                    }
                }
                const flag = result !== undefined && result !== null;
                if (log) logger.info(log, flag ? '(OK)' : '(failed)');
                if (flag) {
                    resolve(result);
                } else {
                    setTimeout(loop, interval);
                }
            });
        };
        loop();
    });
}
// Handle requests sent to the background page. Only actions prefixed with
// EXT_NAME are answered; the only supported one stores an uploaded state.
chrome.runtime.onMessage.addListener((request, sender, sendResponse) => {
    const action = request.action;
    if (!action || !action.startsWith(EXT_NAME)) {
        return;
    }
    if (action === ACTION_UPLOAD_STATE) {
        sendResponse('recieved!');
        // Keep the uploaded state until Extractor.load() picks it up.
        __EXTRACTOR_STATE__ = request.state;
        logger.info(`State (${request.name}) recieved. To load it: some_var = new Extractor().load()`);
    } else {
        sendResponse("Request not supported.");
    }
});

View File

@ -1,37 +0,0 @@
/**
 * Wraps extracted data: an array of rows, each row an array of cell values.
 */
class ExtractResult {
    /**
     * @param {Array<Array<any>>} data rows of cells; defaults to an empty array
     */
    constructor(data) {
        this._data = data || [];
    }
    /**
     * @param {number} index row index
     * @returns {Array<any>} the row at `index`
     */
    row(index) {
        return this._data[index];
    }
    /**
     * @param {number} index column index
     * @returns {Array<any>} the cell at `index` of every row
     */
    column(index) {
        return this._data.map(row => row[index]);
    }
    /**
     * @returns {Array<any>} all cells of all rows in one flat array
     */
    squash() {
        return this._data.flat();
    }
    /** The wrapped rows. */
    get data() {
        return this._data;
    }
    /**
     * Render the rows as CSV: every cell trimmed, quoted, and with embedded
     * double quotes doubled; one line per row, each terminated by "\n".
     * Missing cells (null / undefined) are rendered as empty strings and
     * non-string cells are stringified, instead of throwing on `.trim()`.
     * @param {number} rowsCount when > 0, only the first `rowsCount` rows are rendered
     * @returns {string} the CSV text
     */
    toString(rowsCount) {
        const rows = rowsCount > 0 ? this._data.slice(0, rowsCount) : this._data;
        return rows
            .map(cells => {
                // an empty or missing row still produces an (empty) line
                if (!cells || !cells.length) return "\n";
                const line = cells
                    .map(cell => `"${String(cell ?? "").trim().replace(/"/g, '""')}"`)
                    .join(",");
                return line + "\n";
            })
            .join("");
    }
}

View File

@ -1,88 +0,0 @@
/**
 * A single extraction task: visits a list of URLs in a tab and stores the
 * rows extracted from each URL, keyed by that URL.
 */
class Task {
    // extracted rows per visited URL
    _data = {};
    // visited URLs, in the order their results were stored
    _data_keys = [];
    /**
     * Create a task.
     * constructor(options, itemsSelector:string, fieldSelectors:string[])
     * constructor(options, itemsSelector:string, fieldSelectors:string[], url:string, from:number, to:number, interval:number)
     * constructor(options, itemsSelector:string, fieldSelectors:string[], url:string, pages:number[])
     * constructor(options, itemsSelector:string, fieldSelectors:string[], urls:string[])
     * @param {object} options task options; only "scrollToBottom" is read here. May be omitted.
     * @param {...any} args
     * @throws {Error} when testArgs rejects the arguments
     */
    constructor(options, ...args) {
        if (!testArgs(...args))
            throw new Error(`Invalid call arguments.\n\n${signitures}\n\n`);
        // Default to an empty object so execute() can read options safely
        // even when the Extractor was created without any.
        this._options = options || {};
        this._itemsSelector = args.shift();
        this._fieldSelectors = args.shift();
        this._urls = parseUrls(...args);
    }
    /**
     * Restore the task from a previously saved state object.
     * @param {object} state object carrying the saved `_data`, `_data_keys`,
     *        `_itemsSelector`, `_fieldSelectors` and `_urls`
     * @returns {Task} this task
     */
    load(state) {
        this._data = state._data;
        this._data_keys = state._data_keys;
        this._itemsSelector = state._itemsSelector;
        this._fieldSelectors = state._fieldSelectors;
        this._urls = state._urls;
        return this;
    }
    get urls() {
        return this._urls;
    }
    get data() {
        return this._data;
    }
    /** All stored rows, concatenated in the order their URLs were stored. */
    get results() {
        return this._data_keys.reduce((rows, key) => {
            return rows.concat(this._data[key]);
        }, []);
    }
    get fieldSelectors() {
        return this._fieldSelectors;
    }
    /** Drop all stored results. */
    clean() {
        this._data = {};
        this._data_keys = [];
    }
    /**
     * Run the task in the given tab.
     *
     * Target URLs are the task's own URLs; when it has none, they are parsed
     * from `upstreamData`, or fall back to the tab's current URL. URLs that
     * already have a stored result are skipped, so a stopped task continues
     * where it left off. An empty URL list stores nothing.
     *
     * @param {any} tab target tab
     * @param {any} upstreamData results of the previous task, used as URL source
     * @returns {Promise<void>} settles when all URLs are processed; rejects
     *          when there is no tab or any step fails
     */
    async execute(tab, upstreamData) {
        if (!tab) return Promise.reject("No tab to execute the task.");
        let urls = this._urls;
        if (!urls.length) {
            urls = upstreamData ? parseUrls(upstreamData) : [await queryUrl(tab)];
        }
        for (const url of urls) {
            // already extracted in a previous run
            if (this._data[url]) continue;
            await redirectTab(tab, url);
            if (this._options["scrollToBottom"]) {
                await scrollToBottom(tab);
            }
            const results = await extractTabData(tab, this._itemsSelector, this._fieldSelectors);
            this._data[url] = results;
            this._data_keys.push(url);
        }
    }
}

View File

@ -1,114 +0,0 @@
(function () {
let asleep = false;
// Entry point for requests from the background page. While the script is
// "asleep" only the wake-up action is handled; everything else is answered
// with undefined.
chrome.runtime.onMessage.addListener((request, sender, sendResponse) => {
    if (!request.action) return undefined;
    const refused = asleep && ACTION_WAKEUP != request.action;
    if (refused) {
        sendResponse && sendResponse(undefined);
        return undefined;
    }
    // console.log("Recieved request:",request);
    const pending = doAction(request, sender);
    pending.then(r => sendResponse && sendResponse(r));
    // return true to indicate you wish to send a response asynchronously
    return true;
});
/**
 * Perform the action named by `request.action` and return its response.
 * Unknown actions resolve to undefined.
 * @param {object} request the request; `action` selects the branch, and the
 *        extract / goto branches also read `itemsSelector`, `fieldSelectors`, `url`
 * @param {object} sender the message sender (not used here)
 * @returns {Promise<any>} a promise of the response value
 */
async function doAction(request, sender) {
    const action = request.action;
    if (action === ACTION_EXTRACT) {
        return extract(request.itemsSelector, request.fieldSelectors);
    }
    if (action === ACTION_GOTO_URL) {
        window.location.replace(request.url);
        // should not recieve any request until the page & script reload
        asleep = true;
        return request.url;
    }
    if (action === ACTION_REPORT_IN) {
        return request.action;
    }
    if (action === ACTION_QUERY_URL) {
        return window.location.href;
    }
    if (action === ACTION_SCROLL_BOTTOM) {
        const scroll = () => window.scrollTo(0, document.body.clientHeight);
        const atBottom = () => document.body.clientHeight - window.scrollY - window.innerHeight < 20;
        return executeUntil(scroll, atBottom, "Scroll to page bottom...", 1000, 10);
    }
    if (action === ACTION_SLEEP) {
        asleep = true;
        return "Content script is sleeping.";
    }
    if (action === ACTION_WAKEUP) {
        asleep = false;
        return "Content script is available.";
    }
    return undefined;
}
/**
 * Extract one row per element matching `itemsSelector`, with one cell per
 * field selector ("selector" for trimmed text, "selector@attr" for a property).
 *
 * Elements may be loaded asynchronously, so MSG_ELEMENT_NOT_FOUND is returned
 * (telling the sender to retry) when no item exists yet, or when some field
 * selector matched nothing in any row. With wrong selectors the sender
 * therefore retries indefinitely.
 *
 * @param {string} itemsSelector selector for the row elements
 * @param {string[]} fieldSelectors selectors for the cells below each row
 * @returns {any} the rows, or MSG_ELEMENT_NOT_FOUND
 */
function extract(itemsSelector, fieldSelectors) {
    const rowElements = Array.from(document.querySelectorAll(itemsSelector));
    // items may not loaded yet, tell the sender to retry.
    if (!rowElements.length) return MSG_ELEMENT_NOT_FOUND;

    // field selectors that matched in at least one row
    const matchedOnce = {};
    const readCell = (rowElement, selector) => {
        const [cls, attr] = selector.split('@').slice(0, 2);
        const nodes = Array.from(rowElement.querySelectorAll(cls));
        if (!nodes.length) {
            return undefined;
        }
        matchedOnce[selector] = true;
        const values = nodes.map(node => attr ? node[attr] : node.textContent.trim());
        return values.join('\n');
    };
    const rows = rowElements.map(
        rowElement => fieldSelectors.map(selector => readCell(rowElement, selector))
    );
    // if it exists a field, which is not found in any row, the sender should retry.
    const mustRetry = fieldSelectors.some(selector => !matchedOnce[selector]);
    if (mustRetry) return MSG_ELEMENT_NOT_FOUND;
    return rows;
}
/**
 * Repeatedly execute a function until the detector returns true.
 *
 * Each round runs `fn`, waits `interval` ms, then asks the detector. The
 * promise resolves with true as soon as the detector is satisfied (or when
 * no detector is given), and rejects with false once `fn` has been executed
 * `limit` times without success.
 *
 * @param {object} fn the function to execute
 * @param {object} detector the detector.
 * @param {string} log messages logged to console.
 * @param {number} interval interval for detecting, default: 500ms
 * @param {number} limit max execute times of a function; falsy means no limit
 * @return {Promise} a promise of the response.
 */
function executeUntil(fn, detector, log, interval, limit) {
    interval = interval || 500;
    let count = 0;
    return new Promise((resolve, reject) => {
        const loop = () => {
            fn();
            // count executions; the original incremented `limit` instead,
            // so the limit could never be reached.
            count++;
            setTimeout(() => {
                const flag = !detector || detector();
                if (log) console.log(log, flag ? '(OK)' : '(failed)');
                if (flag) {
                    resolve(true);
                } else if (limit && count >= limit) {
                    reject(false);
                } else {
                    loop();
                }
            }, interval);
        };
        loop();
    });
}
})();

View File

@ -1,10 +0,0 @@
// Prefix shared by every action name of this extension.
const EXT_NAME = "DataExtracter";
// Action names exchanged between the background page, content scripts and popup.
const ACTION_EXTRACT = EXT_NAME + ":Extract";
const ACTION_GOTO_URL = EXT_NAME + ":GoToTUL";
const ACTION_REPORT_IN = EXT_NAME + ":ReportIn";
const ACTION_QUERY_URL = EXT_NAME + ":QueryURL";
const ACTION_SCROLL_BOTTOM = EXT_NAME + ":ScrollToBottom";
const ACTION_UPLOAD_STATE = EXT_NAME + ":UploadStateFile";
const ACTION_SLEEP = EXT_NAME + ":Sleep";
const ACTION_WAKEUP = EXT_NAME + ":WakeUp";

File diff suppressed because one or more lines are too long

202
src/background/actions.ts Normal file
View File

@ -0,0 +1,202 @@
import { Actions, Request } from "../common";
import { sendMessage, ResponseChecker } from "./messaging";
import { logger } from "../common/logger";
/**
* Redirect the tab to the given URL, unless the tab already reports that URL.
* @param {any} tab target tab
* @param {string} url target URL
* @param {boolean} check when truthy, the tab URL is queried again after the
* redirect request and the user is asked what to do if it differs from `url`
* @returns {Promise<string>} resolves to undefined when the tab is already at
* `url`; otherwise to the value sendMessage resolves with (with `check`, the
* checker yields the target URL, or the actual URL if the user chooses to continue)
*/
export function redirectTab(tab: chrome.tabs.Tab, url: string, check?: boolean) {
return queryUrl(tab).then(u => {
if (url !== u) {
let req: Request = {
action: Actions.GOTO_URL,
url: url
}
// no checker at all unless the caller asked for verification
let checker: ResponseChecker<string> = !check ? undefined : async (r, err, tryCount): Promise<string> => {
let queryErr: any;
// capture a failed URL query so it can be rethrown from the checker
let newURL = await queryUrl(tab).catch(e => queryErr = e);
if (queryErr) {
throw queryErr;
}
if (newURL == url) return url;
// mismatch: OK accepts the actual URL, Cancel returns undefined (retry)
if (
confirm(`Cannot navigate to target url.
expected: ${url}\n
actual: ${newURL}\n
Press OK to continue, Cancel to retry. Close the tab to stop`)
) {
return newURL;
}
return undefined;
}
return sendMessage<string>(tab, req, `Goto url: ${url}`, checker);
}
});
}
/**
 * Extract data from the target tab.
 *
 * While the response carries no rows the request is retried; on every 20th
 * try it gives up with an empty result, either silently (`askOnfail` falsy)
 * or after the user confirmed. A response carrying an error is rethrown.
 *
 * @param {any} tab target tab
 * @param {string} itemsSelector items selectors for selecting items (data rows)
 * @param {Array<string>} fieldSelectors fields selectors for selecting fields (data columns) under each item
 * @param {string} expectedURL sent along with the request as `url`
 * @param {boolean} askOnfail ask the user before giving up on a page
 * @returns {Promise<string[][]>} a promise of extracted data
 */
export function extractTabData(tab: chrome.tabs.Tab, itemsSelector: string, fieldSelectors: string[], expectedURL?: string, askOnfail?: boolean) {
    const checker: ResponseChecker<string[][]> = (response, err, tryCount) => {
        if (response.error) throw response.error;
        const rows = response.result;
        if (rows && rows.length) return rows;
        const giveUp = tryCount % 20 == 0 && (
            !askOnfail ||
            confirm('No data found in current page. \n\nContinue to next page?')
        );
        // undefined asks sendMessage for another try
        if (!giveUp) return undefined;
        logger.warn(`Failed after ${tryCount} tries: ${tab.url}`)
        return [];
    };
    const req: Request = {
        action: Actions.EXTRACT,
        itemsSelector: itemsSelector,
        fieldSelectors: fieldSelectors,
        url: expectedURL,
    }
    return sendMessage<string[][]>(tab, req, 'Extract data from the tab...', checker);
}
/**
 * Ping the target tab; typically used to find out whether the content
 * script is ready to answer.
 * @param {any} tab target tab
 * @param {number} count forwarded to sendMessage as its last argument
 * @returns {Promise<boolean>} resolves to true when the tab answered "pong"
 */
export async function ping(tab, count = 1) {
    const req = {
        action: Actions.PING
    }
    const acceptPong: ResponseChecker<string> = (r, e, c) => {
        if (r.result == "pong") return r.result;
        return undefined;
    };
    const pending = sendMessage<string>(tab, req, 'Check tab availability...', acceptPong, 1000, 1000, count);
    // a rejected ping simply counts as "not available"
    const reply = await pending.catch(() => { });
    return reply == "pong";
}
/**
 * Ask the target tab for its current URL.
 * @param {any} tab target tab
 * @returns {Promise<string>} a promise of the url
 */
export function queryUrl(tab: chrome.tabs.Tab) {
    const req = { action: Actions.QUERY_URL };
    return sendMessage<string>(tab, req);
}
/**
 * Ask the target tab to scroll to the bottom of its page.
 * @param {any} tab target tab
 * @returns the promise returned by sendMessage
 */
export function scrollToBottom(tab: chrome.tabs.Tab) {
    const req = { action: Actions.SCROLL_BOTTOM };
    return sendMessage(tab, req, 'Scroll to page bottom...');
}
/**
 * Create a tab, preferring an existing incognito window when there is one.
 *
 * The incognito lookup is awaited directly instead of being wrapped in a
 * hand-built promise, so a failure of the lookup rejects this promise
 * rather than leaving it pending forever.
 *
 * @param {string} url url to open in the new tab
 * @param {boolean} active whether the new tab becomes the active one
 * @returns {Promise<chrome.tabs.Tab>} resolves with the tab passed to the chrome callback
 */
export async function createTab(url: string, active: boolean): Promise<chrome.tabs.Tab> {
    const incognitoWindow = await findIncognitoWindow();
    return new Promise<chrome.tabs.Tab>(resolve => {
        chrome.tabs.create({
            'url': url,
            'active': active,
            // createTab to incognito window first
            'windowId': incognitoWindow ? incognitoWindow.id : undefined
        }, function (tab) {
            resolve(tab);
        });
    });
}
/**
 * Look for an open incognito window among the windows of type 'normal'.
 * @returns {Promise<chrome.windows.Window>} the first incognito window, or undefined when there is none
 */
export async function findIncognitoWindow(): Promise<chrome.windows.Window> {
    const filter = { windowTypes: ['normal'] };
    return new Promise((resolve, reject) => {
        chrome.windows.getAll(filter, (windows: chrome.windows.Window[]) => {
            const incognito = windows.find(candidate => candidate.incognito);
            resolve(incognito);
        });
    });
}
/**
 * Promise wrapper around chrome.windows.getCurrent.
 * @returns {Promise<chrome.windows.Window>} resolves with the window passed to the chrome callback
 */
export async function getCurrentWindow(): Promise<chrome.windows.Window> {
    return new Promise((resolve, reject) => {
        const onWindow = (current: chrome.windows.Window) => {
            resolve(current);
        };
        chrome.windows.getCurrent(onWindow);
    });
}
/**
 * Promise wrapper around chrome.windows.get.
 * @param {number} id id of the window to look up
 * @returns {Promise<chrome.windows.Window>} resolves with whatever chrome passes to the callback
 */
export async function getWindowByID(id: number) {
    return new Promise<chrome.windows.Window>((resolve, reject) => {
        chrome.windows.get(id, found => {
            // read lastError on purpose and ignore its value
            void chrome.runtime.lastError;
            resolve(found);
        });
    });
}
/**
 * Promise wrapper around chrome.windows.create for an incognito window.
 * @returns a promise resolving with the window passed to the chrome callback
 */
export async function CreateIncognitoWindow() {
    const createData = <chrome.windows.CreateData>{
        incognito: true,
    };
    return new Promise((resolve, reject) => {
        chrome.windows.create(createData, (created: chrome.windows.Window) => {
            resolve(created);
        });
    });
}
/**
 * Promise wrapper around chrome.tabs.query for the active tab.
 * @param {boolean} currentWindow passed through as the `currentWindow` query filter
 * @returns {Promise<chrome.tabs.Tab>} resolves with the first matching tab (undefined when none matched)
 */
export async function getActiveTab(currentWindow: boolean): Promise<chrome.tabs.Tab> {
    const queryInfo = { active: true, currentWindow: currentWindow };
    return new Promise((resolve, reject) => {
        chrome.tabs.query(queryInfo, tabs => {
            const firstTab = tabs[0];
            resolve(firstTab);
        });
    });
}
/**
 * Promise wrapper around chrome.tabs.get.
 * @param {number} id id of the tab to look up
 * @returns {Promise<chrome.tabs.Tab>} resolves with whatever chrome passes to the callback
 */
export async function getTabByID(id: number): Promise<chrome.tabs.Tab> {
    return new Promise((resolve, reject) => {
        chrome.tabs.get(id, found => {
            // read lastError on purpose and ignore its value
            void chrome.runtime.lastError;
            resolve(found);
        });
    });
}

31
src/background/caches.ts Normal file
View File

@ -0,0 +1,31 @@
import { logger } from "../common/logger";
import { Actions } from "../common";
import { messageSubscribers } from "./messaging";
/**
 * Holds the most recently uploaded Extractor state until it is loaded,
 * or until 30 seconds have passed since that upload.
 */
export class Caches {
    private _state: string = "";
    // pending expiry timer of the currently cached state, if any
    private _expiry: ReturnType<typeof setTimeout> | undefined = undefined;
    constructor() {
        messageSubscribers.addListener(Actions.UPLOAD_STATE, (request, sender, sendResponse) => {
            sendResponse('recieved!');
            this.setState(request.fileName, request.state)
        });
    }
    /**
     * The cached state. Reading it consumes it: the cache is empty afterwards.
     */
    get state(): string {
        let s = this._state;
        this._state = "";
        return s;
    }
    /**
     * Cache an uploaded state for 30 seconds.
     * @param {string} name name of the uploaded state, only used for logging
     * @param {string} content the state content
     */
    setState(name: string, content: string) {
        // Cancel the timer of a previous upload; otherwise it would wipe
        // this newer state before its own 30 seconds have passed.
        if (this._expiry !== undefined) {
            clearTimeout(this._expiry);
        }
        this._state = content;
        logger.info(`State (${name}) recieved. To load it: some_var = new Extractor().load()`);
        // clear cache in 30 seconds
        this._expiry = setTimeout(() => {
            this._expiry = undefined;
            if (this._state) {
                logger.info(`Uploaded state is cleaned after 30 second.`);
                this._state = "";
            }
        }, 30000);
    }
}
export const caches = new Caches();

View File

@ -1,10 +1,24 @@
var __EXTRACTOR_STATE__ = "";
import { Task } from "./task";
import { parseUrls, saveFile } from "./tools";
import { createTab, getActiveTab, ping, redirectTab } from "./actions";
import { logger } from "../common/logger";
import { caches } from "./caches";
import { ExtractResult } from "./result";
class Extractor {
constructor(options) {
this._tasks = [];
this._running = false;
this._options = options;
export class Extractor {
private _tasks: Task[] = [];
private _running = false;
private _options: any = {};
constructor(options?) {
if (options) this._options = options;
}
static async ping(count: number = 1) {
let tab = await getActiveTab(true) || await getActiveTab(false);
let succ = await ping(tab, count);
if (!succ) {
logger.error('Cannot contact with active tab.');
return;
}
}
/**
* Save current state, in case we restore it later.
@ -16,12 +30,12 @@ class Extractor {
* Restore previous state by loading from saved state.
*/
load() {
if (!__EXTRACTOR_STATE__) {
let content = caches.state;
if (!content) {
logger.info('No state found. Please upload a saved state from the popup window first.');
return;
}
let state = JSON.parse(__EXTRACTOR_STATE__);
__EXTRACTOR_STATE__ = "";
let state = JSON.parse(content);
this._options = state._options;
this._tasks = state._tasks.map(t => new Task(this._options, 'whaterver', ['whaterver']).load(t));
return this;
@ -32,10 +46,21 @@ class Extractor {
* If url arguments not given within later tasks, they will use previous task result as input (target url list).
* @param {...any} args itemsSelector, fieldSelectors, and more args to specify target urls.
*/
task(...args) {
task(...args: any) {
this._tasks.push(new Task(this._options, ...args));
return this;
}
/**
* Get the results of a task.
* @param {number} id index of the task (begins with 0) whose results are returned.
*/
results(id?: number): ExtractResult {
id = this._checkTaskId(id);
if (id < 0) return;
return this._tasks[id].results;
}
/**
* Clear tasks and task caches.
*/
@ -46,14 +71,30 @@ class Extractor {
/**
* Start the task chain.
*/
async start() {
start() {
return this._startTasks(0);
}
stop(id?: number) {
if (id !== undefined) {
id = this._checkTaskId(id);
if (id < 0) return;
this._tasks[id].stop();
return;
}
for (let i = 0; i < this._tasks.length; i++) {
this._tasks[i].stop();
}
}
watch(id: number) {
id = this._checkTaskId(id);
if (id < 0) return;
this._tasks[id].watch();
}
/**
* restart from specified task, but don't restart the previous tasks.
* @param {number} from where to restart the tasks, begins with 0
*/
async restart(from = 0) {
restart(from: number = 0) {
let id = this._checkTaskId(from, 0);
if (id < 0) return;
for (let i = id; i < this._tasks.length; i++) {
@ -61,7 +102,7 @@ class Extractor {
}
return this._startTasks(0);
}
async _startTasks(from) {
async _startTasks(from: number) {
if (this._running) {
logger.info('The Extractor is running. Please wait..');
return;
@ -85,17 +126,17 @@ class Extractor {
}
}
this._running = true;
return this._tasks.reduce((pms, task, i) => {
return this._tasks.reduce((pms, task: Task, i: number) => {
return pms.then(
() => {
if (i < from) return;
if (i > 0) {
let prevTask = this._tasks[i - 1];
return task.execute(tab, new ExtractResult(prevTask.results));
return task.execute(tab, prevTask.results);
}
return task.execute(tab, undefined);
return task.execute(tab);
});
}, Promise.resolve(undefined)).then(
}, Promise.resolve<void>(undefined)).then(
() => {
this._running = false;
this.export();
@ -109,26 +150,26 @@ class Extractor {
* export result of a task to CSV
* @param {number} taskid which task id to save, begins with 0
*/
export(taskid) {
export(taskid?: number) {
let id = this._checkTaskId(taskid, this._tasks.length - 1);
if (id < 0) return;
let results = this._tasks[id].results
if (!results.length) {
let count = results.data.length
if (!count) {
logger.info(`No result for task #${id}. Forget to call ".start()"?`);
return;
}
results.unshift(this._tasks[id].fieldSelectors);
let exResults = new ExtractResult(results);
results.header = this._tasks[id].fieldSelectors;
let msg = `
Please confirm to download (${results.length - 1} items)
Please confirm to download (${count} items)
${exResults.toString(50) || "- Empty -"}
${results.toString(50) || "- Empty -"}
`.trim();
if (confirm(msg)) {
saveFile(exResults, "text/csv");
saveFile(results.toString(), "text/csv");
}
}
_checkTaskId(id, defaultId) {
private _checkTaskId(id: number, defaultId?: number) {
if (!this._tasks.length) {
logger.info("No task found.");
return -1;

14
src/background/index.ts Normal file
View File

@ -0,0 +1,14 @@
import { Extractor } from "./extractor";
// Expose the console helpers on the page's global `window` object.
declare global {
    interface Window {
        $: (...args: any) => void;
        Extractor: any;
    }
}

// `$(...)` is a shortcut: build a fresh Extractor, add one task made from the
// given arguments, and start it right away.
window.$ = function (...args) {
    const extractor = new Extractor();
    const withTask = extractor.task(...args);
    return withTask.start();
}

// The class itself is exposed too, for multi-task usage.
window.Extractor = Extractor;

150
src/background/messaging.ts Normal file
View File

@ -0,0 +1,150 @@
import { Request, Actions, Response } from "../common";
import { getTabByID } from "./actions";
import { logger } from "../common/logger";
export type ResponseCheckerSync<T> = (r: Response<T>, err: chrome.runtime.LastError, count: number) => T;
export type ResponseCheckerAsync<T> = (r: Response<T>, err: chrome.runtime.LastError, count: number) => Promise<T>;
export type ResponseChecker<T> = ResponseCheckerSync<T> | ResponseCheckerAsync<T>;
/**
 * Send a message to the target tab repeatedly until a usable (not undefined)
 * result is obtained.
 * @param {object} tab the tab where to send the message.
 * @param {object} req the request data.
 * @param {string} log message logged to console after every attempt.
 * @param {function} dataChecker (response, err, tryCount) => any.
 * Checks the response and decides what value is finally returned.
 * Return undefined to make 'sendMessage' retry.
 * Throw (or reject) to cancel this promise with that reason.
 * @param {number} timeout seconds to wait for the response of one attempt, default: 10.
 * @param {number} interval retry interval in milliseconds, default: 500.
 * @param {number} limit retry limit, default: 0, no limit.
 * @return {Promise} a promise of the checked result. It rejects when the tab
 * is closed, the retry limit is reached, an attempt times out, or the
 * response / checker reports an error.
 */
export function sendMessage<T>(
    tab: chrome.tabs.Tab,
    req: Request,
    log?: string,
    dataChecker?: ResponseChecker<T>,
    timeout?: number,
    interval?: number,
    limit?: number
) {
    timeout = timeout || 10;
    interval = interval || 500;
    limit = isNaN(limit) ? 0 : limit;
    let count = 0;
    return new Promise<T>((resolve, reject) => {
        // Once the promise is settled nothing may keep the retry loop alive.
        let settled = false;
        const succeed = (value: T) => {
            if (settled) return;
            settled = true;
            resolve(value);
        };
        const fail = (reason: any) => {
            if (settled) return;
            settled = true;
            reject(reason);
        };
        // loop() is async: route unexpected failures into the promise
        // instead of leaving them as unhandled rejections.
        const next = () => { loop().catch(fail); };

        next();

        async function loop() {
            if (settled) return;
            logger.debug("Request for", Actions[req.action]);
            let tabAvailable = await getTabByID(tab.id);
            if (!tabAvailable) {
                fail("Task interrupted due to the target tab is closed.");
                return;
            }
            if (limit && count >= limit) {
                fail(`sendMessage loop limit ${limit} reached.`);
                return;
            }
            count++;
            // The timer must not shadow the `timeout` parameter: the parameter
            // (in seconds) is what drives both the delay and the message.
            const timer = setTimeout(
                () => fail(`${Actions[req.action]} request timeout after ${timeout}s`),
                timeout * 1000
            );
            chrome.tabs.sendMessage(tab.id, req, async (r: Response<T>) => {
                clearTimeout(timer);
                // a late response after a timeout must not restart the loop.
                if (settled) return;
                // check error but do nothing until dataChecker.
                let err = chrome.runtime.lastError;
                let [result, error] = await checkResponse(dataChecker, r, err, count);
                if (error) {
                    fail(error);
                    return;
                }
                let flag = result !== undefined;
                if (log) logger.info(log, flag ? '(OK)' : '(failed)');
                if (flag) {
                    succeed(result);
                } else {
                    setTimeout(() => {
                        logger.debug('Invalid response', r, 'retry...');
                        next();
                    }, interval);
                }
            });
        }
    });
}
/**
 * Turn a raw content-script response into a `[result, error]` pair.
 *
 * - No response at all yields `[undefined, undefined]`, which tells
 *   `sendMessage` to retry.
 * - Without a checker the response's own `result` / `error` are used.
 * - With a checker, its return value (sync or async) becomes the result and
 *   anything it throws or rejects with becomes the error.
 */
async function checkResponse<T>(
    dataChecker: ResponseChecker<T>,
    response: Response<T>,
    error: chrome.runtime.LastError,
    tryCount: number
): Promise<[T, string]> {
    // The content script may have been interrupted: nothing to check, retry.
    if (!response) {
        return [undefined, undefined];
    }
    if (!dataChecker) {
        return [response.result, response.error];
    }

    let checked: T | Promise<T>;
    try {
        checked = dataChecker(response, error, tryCount);
    } catch (thrown) {
        return [undefined, thrown];
    }

    // Synchronous checkers are done here; only real Promises are awaited.
    if (!(checked instanceof Promise)) {
        return [checked, undefined];
    }

    let rejection: any;
    const value: T = await checked.catch(reason => rejection = reason);
    if (rejection) {
        return [undefined, rejection];
    }
    return [value, undefined];
}
export type ActionSubscriberSync = (request: Request, sender: chrome.runtime.MessageSender, sendResponse: (response?: any) => void) => void;
export type ActionSubscriberAsync = (request: Request, sender: chrome.runtime.MessageSender, sendResponse: (response?: any) => void) => Promise<void>;
export type ActionSubscriber = ActionSubscriberSync | ActionSubscriberAsync;
/**
 * Registry of message subscribers, grouped by the action they listen to.
 */
class MessageSubscribers {
    private listeners: { [key: number]: ActionSubscriber[] } = {};

    /** Register `subscriber` for `action`; the same subscriber may be added more than once. */
    addListener(action: Actions, subscriber: ActionSubscriber) {
        if (!this.listeners[action]) {
            this.listeners[action] = [];
        }
        this.listeners[action].push(subscriber);
    }

    /** Remove every registration of `subscriber` for `action`. */
    removeListener(action: Actions, subscriber: ActionSubscriber) {
        const registered = this.listeners[action] || (this.listeners[action] = []);
        // Walk backwards so in-place removal does not disturb the index.
        for (let idx = registered.length - 1; idx >= 0; idx--) {
            if (registered[idx] == subscriber) {
                registered.splice(idx, 1);
            }
        }
        logger.debug(`${registered.length} subscriber(s) remained for action ${Actions[action]}`);
    }

    /** The subscribers of `action`, or undefined when the action was never used. */
    getListeners(action: Actions): ActionSubscriber[] {
        return this.listeners[action]
    }
}

export const messageSubscribers = new MessageSubscribers();
// Dispatch every incoming runtime message to the subscribers registered for
// its action.
chrome.runtime.onMessage.addListener(function (request: Request, sender, sendResponse) {
    let subscribers = messageSubscribers.getListeners(request.action);
    if (!subscribers || !subscribers.length) {
        sendResponse("Request not supported.");
        return;
    }
    let promises: Promise<any>[] = [];
    // Iterate over a snapshot: a subscriber may unregister itself while
    // the message is being dispatched.
    for (let subscriber of subscribers.slice()) {
        let p = subscriber(request, sender, sendResponse);
        if (p instanceof Promise) promises.push(p);
    }
    if (!promises.length) return;
    // Failures of asynchronous subscribers are logged instead of being left
    // as unhandled rejections.
    Promise.all(promises).catch(e => logger.error(e));
    // The callback-style API documents the literal `true`, not a returned
    // Promise, as the way to keep `sendResponse` usable after this listener
    // returns.
    return true;
});

85
src/background/result.ts Normal file
View File

@ -0,0 +1,85 @@
import { logger } from "../common/logger";
import { getActiveTab, ping, redirectTab } from "./actions";
import { parseUrls } from "./tools";
/**
 * A table of extracted cells (rows of strings) with an optional header,
 * plus helpers to read, export and iterate it.
 */
export class ExtractResult {
    private _header: string[];
    private _data: string[][] = [];

    /**
     * @param data the extracted rows; a falsy value means "no rows".
     */
    constructor(data: string[][]) {
        this._data = data || [];
    }

    /** The row at `index`, or undefined when out of range. */
    row(index: number): string[] {
        return this._data[index];
    }

    /** The cell at position `index` of every row, in row order. */
    column(index: number): string[] {
        return this._data.map(cells => (cells ? cells[index] : undefined));
    }

    /** All cells of all rows flattened into a single list. */
    squash(): string[] {
        return this._data.reduce((p, c) => p.concat(c), []);
    }

    /** Header line emitted before the data rows by `toString()`. */
    set header(h: string[]) {
        this._header = h
    }

    get data(): string[][] {
        return this._data;
    }

    /**
     * Render the result as CSV text, every cell quoted and trimmed.
     * @param rowsCount max number of data rows to include; 0 (default) means all.
     * @return the CSV text, one line per row, header first when it is set.
     */
    toString(rowsCount: number = 0): string {
        // Always work on a copy: prepending the header to `this._data` itself
        // would add one more header row to the stored data on every call.
        let rows = rowsCount > 0 ? this._data.slice(0, rowsCount) : this._data.slice();
        if (this._header && this._header.length) {
            rows.unshift(this._header);
        }
        return rows.reduce(
            (csv, lineCells) => {
                if (!lineCells || !lineCells.length) {
                    return csv + "\n";
                }
                let line = lineCells.map(
                    cell => {
                        cell = cell || "";
                        // CSV escaping: double every quote, then quote the cell.
                        return '"' + cell.trim().replace(/"/g, '""') + '"';
                    }
                ).join(",");
                return csv + line + "\n";
            },
            ""
        );
    }

    /**
     * Call `fn` for every cell, one after another, waiting for the previous
     * call (and any promise it returns) before starting the next one.
     * The first failure stops the walk and is logged, not rethrown.
     */
    async walk(fn: (row: number, col: number, value: string) => void) {
        let pms = Promise.resolve(null);
        for (let i = 0; i < this._data.length; i++) {
            let cells = this._data[i];
            for (let j = 0; j < cells.length; j++) {
                let row = i;
                let col = j;
                let value = cells[j];
                pms = pms.then(
                    () => fn(row, col, value)
                )
            }
        }
        return pms.catch(err => {
            logger.error(err);
        });
    }

    /**
     * Redirect the active tab to every URL found in the result, one by one.
     * Failures are logged, not rethrown.
     */
    async visit() {
        let urls = parseUrls(this);
        let tab = await getActiveTab(true) || await getActiveTab(false);
        let succ = await ping(tab);
        if (!succ) {
            logger.error('Cannot contact with active tab.');
            return;
        }
        return urls.reduce(
            (pms, url: string) => pms.then(() => redirectTab(tab, url, false)),
            Promise.resolve<void>(undefined)
        ).catch(err => {
            logger.error(err);
        });
    }
}

View File

@ -1,4 +1,6 @@
const signitures = `
import { ExtractResult } from "./result";
export const signitures = `
## Usage
// single task
$(...args);
@ -18,12 +20,13 @@ function(itemsSelector:string, fieldSelectors:string[], urls:string[]);
$(".item", ["a", "a@href"]);
## See Detailed Help:
https://git.jebbs.co/jebbs/data-extracter-extesion
https://git.qjebbs.com/jebbs/data-extracter-extesion
`.trim();
function testArgs(...args) {
export function testArgs(...args: any) {
switch (args.length) {
case 0, 1:
case 0:
case 1:
return false;
case 2:
return args[0] && args[1] &&
@ -66,7 +69,3 @@ function testArgs(...args) {
return arr.reduce((p, c) => p && tester(c), true);
}
}
function argsToString(...args) {
return args.map(v => (v instanceof Array ? `[${v.join(', ')}]` : v.toString())).join(', ');
}

178
src/background/task.ts Normal file
View File

@ -0,0 +1,178 @@
import { parseUrls } from "./tools";
import { queryUrl, redirectTab, scrollToBottom, extractTabData, findIncognitoWindow, getCurrentWindow, getWindowByID } from "./actions";
import { testArgs, signitures } from "./signiture";
import { ExtractResult } from "./result";
import { messageSubscribers, ActionSubscriber } from "./messaging";
import { Actions } from "../common";
import { logger } from "../common/logger";
/**
 * One extraction task: a set of selectors plus the URLs to run them on.
 * Results are stored per URL, in the order the URLs were first saved.
 */
export class Task {
    // extracted rows, keyed by the URL they were extracted from
    private _data: { [key: string]: string[][] } = {};
    // keys of `_data` in insertion order
    private _data_keys: string[] = [];
    private _options: any;
    private _itemsSelector: string;
    private _fieldSelectors: string[];
    private _urls: string[] = [];
    private _running = false;
    // watch listeners registered by this task, removed again by stop()
    private _listeners: ActionSubscriber[] = [];

    constructor(options: any, ...arg: any);
    constructor(options: any, itemsSelector: string, fieldSelectors: string[]);
    constructor(options: any, itemsSelector: string, fieldSelectors: string[], url: string, from: number, to: number, interval: number);
    constructor(options: any, itemsSelector: string, fieldSelectors: string[], url: string, pages: number[]);
    constructor(options: any, itemsSelector: string, fieldSelectors: string[], urls: string[]);
    /**
     * @param options task options; only "scrollToBottom" is read by this class.
     * @param args itemsSelector, fieldSelectors and the optional URL arguments,
     * validated with `testArgs`.
     * @throws {Error} when the arguments do not match a supported signature.
     */
    constructor(options, ...args) {
        if (!testArgs(...args))
            throw new Error(`Invalid call arguments.\n\n${signitures}\n\n`);
        // tolerate a missing options object so option lookups never throw.
        this._options = options || {};
        this._itemsSelector = args.shift();
        this._fieldSelectors = args.shift();
        this._urls = parseUrls(...args);
    }

    /** Restore selectors, URLs and collected data from a saved state object. */
    load(state: any): Task {
        this._data = state._data;
        this._data_keys = state._data_keys;
        this._itemsSelector = state._itemsSelector;
        this._fieldSelectors = state._fieldSelectors;
        this._urls = state._urls;
        return this;
    }

    get urls(): string[] {
        return this._urls;
    }

    /** All saved rows, concatenated in the order their URLs were saved. */
    get results(): ExtractResult {
        let rs: string[][] = this._data_keys.reduce((p, c) => {
            return p.concat(this._data[c]);
        }, []);
        return new ExtractResult(rs);
    }

    get fieldSelectors(): string[] {
        return this._fieldSelectors;
    }

    /** Stop the task and drop everything it has collected. */
    clean(): Task {
        this.stop();
        this._data = {};
        this._data_keys = [];
        return this;
    }

    /** Mark the task as stopped and unregister its watch listeners. */
    stop() {
        this._running = false;
        let listener: ActionSubscriber;
        while (listener = this._listeners.pop()) {
            messageSubscribers.removeListener(Actions.REPORT_NEW_PAGE, listener);
        }
    }

    /**
     * Extract data from every page that reports itself (REPORT_NEW_PAGE) in
     * the watched window: the incognito window if one is found, otherwise
     * the current window. Watching ends with stop() or when the window closes.
     */
    async watch() {
        if (this._running) {
            logger.info("The task is running. Please wait...");
            return;
        }
        this._running = true;
        let window: chrome.windows.Window;
        try {
            window = await findIncognitoWindow() || await getCurrentWindow();
        } catch (e) {
            // never leave the task stuck in the running state.
            this._running = false;
            throw e;
        }
        if (!window) {
            this._running = false;
            logger.info("No window to watch...");
            return;
        }
        let watchTaskID = 0;
        let listener: ActionSubscriber = async (request, sender, sendResponse) => {
            let findWindow = await getWindowByID(window.id);
            if (!findWindow) {
                // stop watch on window close.
                messageSubscribers.removeListener(Actions.REPORT_NEW_PAGE, listener);
                return;
            }
            // only watch current window.
            if (!sender.tab || sender.tab.windowId != window.id) return;
            let taskID = watchTaskID++;
            logger.info(`Watcher #${taskID} starts.`);
            // start from a resolved promise so that a synchronous failure of
            // the optional tasks ends up in the catch below.
            return Promise.resolve().then(
                () => this.makeOptionalTasks(sender.tab)
            ).then(
                () => extractTabData(sender.tab, this._itemsSelector, this._fieldSelectors, sender.tab.url, true)
            ).then(
                results => {
                    if (results && results.length) {
                        this.saveResult(results, sender.tab.url);
                    }
                    logger.info(`Watcher #${taskID} ends.`);
                }
            ).catch(
                e => logger.error(`Watcher #${taskID} ends with:`, e)
            )
        }
        this._listeners.push(listener);
        messageSubscribers.addListener(Actions.REPORT_NEW_PAGE, listener);
    }

    /**
     * Run the task in `tab`: visit every URL in turn and save what is extracted.
     * Without own URLs the task uses the URLs found in `upstreamData`, or,
     * when there is none, the URL the tab currently shows.
     * URLs that already have saved data are skipped.
     * @throws when there is no tab, the task is already running, it is
     * stopped while running, or any step fails.
     */
    async execute(tab: chrome.tabs.Tab, upstreamData?: ExtractResult): Promise<void> {
        if (!tab) throw "No tab to execute the task.";
        if (this._running) throw "The task is running. Please wait...";
        this._running = true;
        let urls = this._urls
        if (!urls.length) {
            if (upstreamData) {
                urls = parseUrls(upstreamData);
            } else {
                let tabURL: string;
                try {
                    tabURL = await queryUrl(tab);
                } catch (e) {
                    // fail loudly instead of continuing with an undefined URL.
                    this._running = false;
                    throw e;
                }
                urls = [tabURL];
            }
        }
        return urls.reduce((p, url, i) => p.then(
            results => {
                // `results` belongs to the previous URL of the chain.
                if (i > 0 && results instanceof Array) {
                    let lastURL = urls[i - 1];
                    this.saveResult(results, lastURL);
                }
                if (this._data[url]) return;
                let pms: Promise<any> = this.runningCheck(() => redirectTab(tab, url));
                return pms
                    .then(() => this.makeOptionalTasks(tab))
                    .then(
                        () => this.runningCheck(() => extractTabData(tab, this._itemsSelector, this._fieldSelectors))
                    );
            }
        ), Promise.resolve<string[][]>(null)).then(
            results => {
                if (results && results.length) {
                    let lastURL = urls[urls.length - 1];
                    this.saveResult(results, lastURL);
                }
                this._running = false;
            }
        ).catch(
            e => {
                this._running = false;
                throw e;
            }
        );
    }

    /**
     * Run the optional per-page steps enabled in the options.
     * Always returns a promise, also when nothing is enabled.
     */
    private makeOptionalTasks(tab: chrome.tabs.Tab): Promise<any> {
        if (this._options["scrollToBottom"]) {
            return this.runningCheck(() => scrollToBottom(tab));
        }
        return Promise.resolve();
    }

    /** Run `fn` only while the task is still running; throw once it is stopped. */
    private runningCheck(fn: () => Promise<any>): Promise<any> {
        if (!this._running) throw "The task is stopped by user.";
        return fn();
    }

    /** Store `results` under `key`, remembering the order keys first appear in. */
    private saveResult(results: string[][], key: string) {
        if (this._data[key] === undefined) {
            // do not add keys again
            this._data_keys.push(key);
        }
        this._data[key] = results;
        logger.info(`${results.length} items found.`)
    }
}

62
src/background/tools.ts Normal file
View File

@ -0,0 +1,62 @@
import { ExtractResult } from "./result";
// matches text with a line that starts with http:// or https:// (case-insensitive)
const URL_REG = /^\s*(https?):\/\//im;

/**
 * Build the URL list of a task from its URL arguments. Supported forms:
 * - (urls: string[])                 the list itself is returned
 * - (result: ExtractResult)          every cell of the result that looks like a URL
 * - (template: string, pages: [])    "${page}" replaced by each page
 * - (template: string, from, to, interval)
 *                                    "${page}" replaced by from, from+interval, ... up to `to`
 * Anything else, including a range whose bounds are not numbers or whose
 * interval is not a positive number, yields an empty list.
 */
export function parseUrls(...args): string[] {
    if (!args.length) return [];
    let arg = args.shift();
    if (arg instanceof Array) {
        return arg;
    }
    if (arg instanceof ExtractResult) {
        return arg.squash().filter(v => URL_REG.test(v));
    }
    let urlTempl = arg;
    if (!urlTempl) return [];
    if (args[0] instanceof Array) {
        return args[0].map(p => urlTempl.replace("${page}", p));
    }
    if (args.length >= 3) {
        let from = Number(args[0]);
        let to = Number(args[1]);
        let interval = Number(args[2]);
        // a zero, negative or NaN interval would never reach `to`.
        if (isNaN(from) || isNaN(to) || !(interval > 0)) return [];
        let urls = [];
        for (let i = from; i <= to; i += interval) {
            urls.push(urlTempl.replace("${page}", i));
        }
        return urls;
    }
    return [];
}
/**
 * Offer `data` to the user as a file download.
 * @param {string} data the file content.
 * @param {string} mimeType MIME type of the content, e.g. "text/csv".
 * @param {string} fileName download name; defaults to the document title, then "result".
 */
export function saveFile(data: string, mimeType: string, fileName?: string) {
    fileName = fileName || document.title || "result";
    let blob: Blob;
    if (typeof window.Blob == "function") {
        blob = new Blob([data], {
            type: mimeType
        })
    } else {
        // fallback used when no Blob constructor is available.
        var BlobBuiler = window.MSBlobBuilder;
        var builer = new BlobBuiler();
        builer.append(data);
        blob = builer.getBlob(mimeType)
    }
    var URL = window.URL || window.webkitURL;
    var url = URL.createObjectURL(blob);
    var link = document.createElement("a");
    if ('download' in link) {
        // simulate a click on a hidden <a download> element.
        link.style.visibility = "hidden";
        link.href = url;
        link.download = fileName;
        document.body.appendChild(link);
        var j = document.createEvent("MouseEvents");
        j.initEvent("click", true, true);
        link.dispatchEvent(j);
        document.body.removeChild(link);
        // Release the blob once the download had time to start; without this
        // every export keeps its data alive for the lifetime of the page.
        setTimeout(() => URL.revokeObjectURL(url), 40000);
    } else if (navigator.msSaveBlob) {
        navigator.msSaveBlob(blob, fileName);
        // the object URL is not used on this path.
        URL.revokeObjectURL(url);
    } else {
        // the page navigates to the object URL, so it must stay valid.
        location.href = url
    }
}

28
src/common/index.ts Normal file
View File

@ -0,0 +1,28 @@
// Identifiers of the message actions exchanged between the extension scripts.
// Numbering starts at 1 (EXTRACT = 1) and continues sequentially, so no action
// value is falsy.
export enum Actions {
// from background to content script
EXTRACT = 1,
GOTO_URL,
PING,
QUERY_URL,
SCROLL_BOTTOM,
SLEEP,
WAKEUP,
// from popup to background script
UPLOAD_STATE,
// from content to background script
REPORT_NEW_PAGE,
}
// A message sent between the extension scripts. Only `action` is mandatory;
// the other fields are used by specific actions.
export interface Request {
// which action the receiver should perform
action: Actions
// EXTRACT: selector of the item (row) elements
itemsSelector?: string
// EXTRACT: selectors of the fields (cells) inside each item
fieldSelectors?: string[]
// GOTO_URL: the URL to load; EXTRACT: the URL the page is expected to show
url?: string
// UPLOAD_STATE: name of the uploaded state file
fileName?: string
// UPLOAD_STATE: text content of the uploaded state file
state?: string
}
// The answer to a Request: the value produced by the action, and an error
// message that is set when the action failed.
export interface Response<T> {
result: T;
error: string;
}

View File

@ -1,42 +1,36 @@
const LOGGER_LEVEL = {
DEBUG: 1,
INFO: 2,
WARNING: 3,
ERROR: 4,
DISABLED: 100,
properties: {
1: { name: "debug", value: 1, prefix: "DEBUG" },
2: { name: "info", value: 2, prefix: "INFO" },
3: { name: "warning", value: 3, prefix: "WARN" },
4: { name: "error", value: 3, prefix: "ERROR" }
}
export enum LOGGER_LEVEL {
DEBUG = 1,
INFO,
WARN,
ERROR,
DISABLED,
};
class Logger {
_notificationId = undefined;
_log_level = LOGGER_LEVEL.INFO;
_notify_level = LOGGER_LEVEL.ERROR;
export class Logger {
private _notificationId = undefined;
private _log_level = LOGGER_LEVEL.INFO;
private _notify_level = LOGGER_LEVEL.ERROR;
constructor(logLevel, notifyLevel) {
if (logLevel) this._log_level = logLevel;
if (notifyLevel) this._notify_level = notifyLevel;
chrome.notifications.onClosed.addListener((id, byUser) => { this._notify_level = undefined });
if (chrome.notifications) chrome.notifications.onClosed.addListener((id, byUser) => { this._notify_level = undefined });
}
get logLevel() {
return this._log_level;
}
set logLevel(val) {
set logLevel(val: LOGGER_LEVEL) {
this._log_level = val;
}
get notifyLevel() {
return this._notify_level;
}
set notifyLevel(val) {
set notifyLevel(val: LOGGER_LEVEL) {
this._notify_level = val;
}
log(level, loggerFn, ...msgs) {
log(level: LOGGER_LEVEL, loggerFn: Function, ...msgs) {
if (level < this._log_level) return;
let time = new Date().toLocaleString();
loggerFn(`${time} [${LOGGER_LEVEL.properties[level].prefix}]`, ...msgs);
loggerFn(`${time} [${LOGGER_LEVEL[level]}]`, ...msgs);
if (level < this._notify_level) return;
this.notify(...msgs);
}
@ -47,7 +41,7 @@ class Logger {
this.log(LOGGER_LEVEL.INFO, console.info, ...msgs);
}
warn(...msgs) {
this.log(LOGGER_LEVEL.WARNING, console.info, ...msgs);
this.log(LOGGER_LEVEL.WARN, console.info, ...msgs);
}
error(...msgs) {
this.log(LOGGER_LEVEL.ERROR, console.info, ...msgs);
@ -78,4 +72,4 @@ class Logger {
}
}
const logger = new Logger(LOGGER_LEVEL.DEBUG, LOGGER_LEVEL.DISABLED);
export const logger = new Logger(LOGGER_LEVEL.DEBUG, LOGGER_LEVEL.DISABLED);

101
src/content/actions.ts Normal file
View File

@ -0,0 +1,101 @@
import { logger } from "../common/logger";
/**
 * Extract a table of field values from the current document.
 *
 * Every element matching `itemsSelector` becomes a row; every entry of
 * `fieldSelectors` becomes a cell. A field selector has the form
 * `[!]<css selector>[@<property>]`:
 * - a leading "!" additionally dispatches a click on every matched element,
 * - an empty css selector addresses the item element itself,
 * - "@property" reads that property of the element instead of its trimmed text.
 * Several matches inside one item are joined with "\n".
 *
 * @param itemsSelector css selector of the item elements.
 * @param fieldSelectors field selectors, see above.
 * @param expectedURL when set, the page must still be at this URL.
 * @return the rows, or an empty list when the page does not look fully
 * loaded yet (no items, or a field that is not found in any item), so that
 * the sender can retry. A wrong selector makes the sender retry forever.
 * @throws a string when the page URL differs from `expectedURL`.
 */
export function extract(itemsSelector: string, fieldSelectors: string[], expectedURL: string): string[][] {
    if (expectedURL && location.href != expectedURL) {
        throw 'Target tab URL changed, aborting...';
    }
    // Some elements may be loaded asynchronously: remember which fields were
    // found at least once. The key is the selector exactly as given by the
    // caller (including a "!" prefix), because that is what it is compared
    // against below.
    let fieldFound: { [key: string]: boolean } = {};
    let items: Element[] = Array.from(document.querySelectorAll(itemsSelector));
    // items may not loaded yet, tell the sender to retry.
    if (!items.length) return [];
    let results: string[][] = items.map(
        item => {
            return fieldSelectors.map(
                fieldSelector => {
                    let selector = fieldSelector;
                    let doClick = false;
                    if (selector.startsWith("!")) {
                        doClick = true;
                        selector = selector.substring(1);
                    }
                    let [cls, attr] = selector.split('@').slice(0, 2);
                    let fieldElements: Element[];
                    cls = cls.trim()
                    if (cls != "") {
                        fieldElements = Array.from(item.querySelectorAll(cls));
                    } else {
                        fieldElements = [item];
                    }
                    if (!fieldElements.length) {
                        return;
                    }
                    fieldFound[fieldSelector] = true;
                    return fieldElements.map(find => {
                        if (doClick) {
                            let e = document.createEvent("MouseEvents");
                            e.initEvent("click", true, true);
                            find.dispatchEvent(e);
                        }
                        return attr ? find[attr] : find.textContent.trim();
                    }).join('\n')
                }
            )
        }
    );
    // TODO: configurable wait logic
    // if it exists a field, which is not found in any row, the sender should retry.
    let notFoundFields = fieldSelectors.filter(f => !fieldFound[f]);
    if (notFoundFields.length > 0) {
        logger.debug('should wait for:', notFoundFields.join(','));
        return [];
    }
    return results;
}
/**
 * Scroll the window down until it is within 20px of the bottom of the body.
 * Checks once per second, with a limit of 10 passed to `executeUntil`.
 * @return {Promise} the promise returned by `executeUntil`.
 */
export function scrollToBottom() {
    const scrollDown = () => window.scrollTo(0, document.body.clientHeight);
    const reachedBottom = () => {
        const remaining = document.body.clientHeight - window.scrollY - window.innerHeight;
        return remaining < 20;
    };
    return executeUntil(scrollDown, reachedBottom, "Scroll to page bottom...", 1000, 10);
}
/**
 * Repeatedly execute a function until the detector returns true.
 * After every execution the detector is consulted `interval` ms later.
 * @param {function} fn the function to execute
 * @param {function} detector the detector; when missing, the first execution counts as success.
 * @param {string} log messages logged to console.
 * @param {number} interval interval for detecting, default: 500ms.
 * @param {number} limit max execute times of the function, 0 for no limit.
 * @return {Promise} resolves to true once the detector is satisfied; rejects
 * with an Error when the limit is used up, or with whatever `fn` / `detector` throws.
 */
function executeUntil(fn: () => void, detector: () => boolean, log: string, interval: number, limit: number) {
    interval = interval || 500;
    let count = 0;
    return new Promise<boolean>((resolve, reject) => {
        loop();
        function loop() {
            try {
                fn();
            } catch (err) {
                reject(err);
                return;
            }
            count++;
            setTimeout(() => {
                let flag: boolean;
                try {
                    flag = !detector || detector();
                } catch (err) {
                    reject(err);
                    return;
                }
                if (log) console.log(log, flag ? '(OK)' : '(failed)');
                if (flag) {
                    resolve(true);
                } else if (limit && count >= limit) {
                    // give up instead of looping forever.
                    reject(new Error(`Condition not met after ${count} attempts.`));
                } else {
                    loop();
                }
            }, interval);
        }
    });
}

75
src/content/index.ts Normal file
View File

@ -0,0 +1,75 @@
import { Request, Actions, Response } from '../common';
import { scrollToBottom, extract } from './actions';
// While asleep the content script answers every request except WAKEUP with
// an undefined response.
let asleep = false;

/**
 * Entry point for requests sent to this content script.
 * Requests without an action are ignored.
 */
function handleRequest(request, sender: chrome.runtime.MessageSender, sendResponse: (r: any) => void) {
    if (!request.action) return;
    const reply = (r: any) => sendResponse && sendResponse(r);
    if (asleep && Actions.WAKEUP != request.action) {
        reply(undefined);
        return;
    }
    // console.log("Recieved request:",request);
    doAction(request, sender).then(reply);
    // return true to indicate you wish to send a response asynchronously
    return true;
}

chrome.runtime.onMessage.addListener(handleRequest);

// Tell the background script that a new page (and this script) has loaded.
chrome.runtime.sendMessage(<Request>{
    action: Actions.REPORT_NEW_PAGE,
});
/**
 * Perform the action of a request and wrap the outcome into a Response.
 * Never throws: anything thrown while performing the action ends up in the
 * `error` field of the response (the message for Error objects, the thrown
 * value itself otherwise).
 */
async function doAction(request: Request, sender: chrome.runtime.MessageSender): Promise<Response<any>> {
    let result: any;
    let error: string;
    try {
        const action = request.action;
        if (action === Actions.EXTRACT) {
            result = extract(request.itemsSelector, request.fieldSelectors, request.url);
        } else if (action === Actions.GOTO_URL) {
            window.location.replace(request.url);
            // should not recieve any request until the page & script reload
            asleep = true;
            result = request.url;
        } else if (action === Actions.PING) {
            result = "pong";
        } else if (action === Actions.QUERY_URL) {
            result = window.location.href;
        } else if (action === Actions.SCROLL_BOTTOM) {
            // NOTE(review): scrollToBottom() returns a promise and it is stored
            // here without being awaited, so the response is built before the
            // scrolling has finished. Confirm whether that is intended.
            result = scrollToBottom();
        } else if (action === Actions.SLEEP) {
            asleep = true;
            result = "Content script is sleeping.";
        } else if (action === Actions.WAKEUP) {
            asleep = false;
            result = "Content script is available.";
        } else {
            error = 'Unsupported action.'
        }
    } catch (err) {
        error = err instanceof Error ? err.message : err;
    }
    return newResponse(result, error);
}
/**
 * Build a Response from a result and an optional error message.
 */
function newResponse<T>(result: T, err?: string): Response<T> {
    return { result, error: err };
}

View File

@ -1,3 +1,5 @@
import { Request, Actions } from '../common';
window.onload = function () {
document.querySelector('#link-extension-detail')
.addEventListener('click', () => {
@ -8,7 +10,7 @@ window.onload = function () {
document.querySelector('#link-document')
.addEventListener('click', () => {
chrome.tabs.create({
'url': `https://git.jebbs.co/jebbs/data-extracter-extesion`
'url': `https://git.qjebbs.com/jebbs/data-extracter-extesion`
});
})
document.querySelector('#state-input')
@ -19,10 +21,10 @@ window.onload = function () {
reader.readAsText(this.files[0], "UTF-8");
reader.onload = function (evt) {
var fileString = evt.target.result;
chrome.runtime.sendMessage({
action: ACTION_UPLOAD_STATE,
chrome.runtime.sendMessage(<Request>{
action: Actions.UPLOAD_STATE,
state: fileString,
name: fileName
fileName: fileName
}, r => {
if (r) console.log('State sent:', r);
});

View File

Before

Width:  |  Height:  |  Size: 36 KiB

After

Width:  |  Height:  |  Size: 36 KiB

View File

@ -3,10 +3,9 @@
<link>
<meta charset="utf-8">
<title>Data Extractor</title>
<script charset="UTF-8" type="text/javascript" src="../scripts/shared/common.js"></script>
<script charset="UTF-8" type="text/javascript" src="tip.js"></script>
<script charset="UTF-8" type="text/javascript" src="../scripts/popup.bundle.js"></script>
<link rel="stylesheet" href="styles/bootstrap.min.css">
<link rel="stylesheet" href="../assets/bootstrap.min.css">
</head>
<body style="margin: 20px 10px;">
@ -19,13 +18,12 @@
<div class="row">
<div class="col">
<div class="alert alert-info small">
<!-- <h6>Usage:</h6> -->
<p>
Goto <a href="#" id="link-extension-detail">Extension Detail</a>, click "backgroud page",
and type your scripts in the console.
</p>
<p>
<img src="../images/console.png" alt=""
<img src="../assets/console.png" alt=""
style="max-width: 489px; width: 100%; border-radius: 5px">
</p>
@ -54,7 +52,7 @@
<p>
<b>Full document at:</b>
<br>
<a href="#" id="link-document">https://git.jebbs.co/jebbs/data-extracter-extesion</a>
<a href="#" id="link-document">https://git.qjebbs.com/jebbs/data-extracter-extesion</a>
</p>
</div>
</div>
@ -66,7 +64,7 @@
</div>
<div class="row">
<div class="col">
<input type="file" name="state" id="state-input">
<input type="file" name="state" id="state-input">
</div>
</div>
</div>

View File

Before

Width:  |  Height:  |  Size: 4.1 KiB

After

Width:  |  Height:  |  Size: 4.1 KiB

35
template/manifest.json Executable file
View File

@ -0,0 +1,35 @@
{
"manifest_version": 2,
"name": "Data Extracter",
"version": "0.5.1",
"author": "jebbs",
"description": "Extract data from web page elements as sheet.",
"icons": {
"16": "icon.png",
"48": "icon.png",
"128": "icon.png"
},
"browser_action": {
"default_icon": "icon.png",
"default_popup": "html/popup.html",
"default_title": "Data Extracter"
},
"background": {
"scripts": [
"scripts/background.bundle.js"
],
"persistent": false
},
"content_scripts": [{
"matches": ["*://*/*"],
"js": [
"scripts/content.bundle.js"
],
"run_at": "document_idle"
}],
"incognito": "spanning",
"permissions": [
"activeTab",
"notifications"
]
}

12
tsconfig.json Normal file
View File

@ -0,0 +1,12 @@
{
"compilerOptions": {
"module": "commonjs",
"target": "es6",
"noImplicitAny": false,
"sourceMap": true,
"rootDir": "src",
"outDir": "dist/js",
"noEmitOnError": true,
"typeRoots": [ "node_modules/@types" ]
}
}

33
webpack.config.js Normal file
View File

@ -0,0 +1,33 @@
const path = require('path');
const CopyPlugin = require('copy-webpack-plugin');
module.exports = {
mode: 'production',
entry: {
background: './src/background/index.ts',
content: './src/content/index.ts',
popup: './src/popup/index.ts',
},
// devtool: 'inline-source-map',
output: {
path: path.resolve(__dirname, 'dist'),
filename: 'scripts/[name].bundle.js'
},
module: {
rules: [
{
test: /\.tsx?$/,
use: 'ts-loader',
exclude: /node_modules/
}
]
},
resolve: {
extensions: ['.tsx', '.ts', '.js']
},
plugins: [
new CopyPlugin([
{ from: '**/*', to: '.', toType: "dir" },
], { context: 'template', logLevel: 'warn' }),
]
};