Compare commits
27 Commits
c504942144
...
master
| Author | SHA1 | Date | |
|---|---|---|---|
| e87e7010ec | |||
| 108ebb835f | |||
| e0b0a5e986 | |||
| 9cd25e3c1d | |||
| 7827d385bd | |||
| ade0670415 | |||
| 63aec616b1 | |||
| 378883b626 | |||
| c78f593c70 | |||
| d82010686d | |||
| 7644a1363f | |||
| 3338f78d91 | |||
| da7ae057f4 | |||
| 2224db1ad1 | |||
| 790c95ffc3 | |||
| f06a6f4e78 | |||
| 3d375261df | |||
| 13e233fbe7 | |||
| 21d3dfb247 | |||
| 97c8aac58d | |||
| 09112bb506 | |||
| c7f4fe7cc4 | |||
| f1cf32b83a | |||
| 341abebc66 | |||
| 0cf04c3f79 | |||
| 6134289d0a | |||
| 0e62d914c1 |
154
.gitignore
vendored
154
.gitignore
vendored
@ -1,2 +1,154 @@
|
|||||||
|
# File created using '.gitignore Generator' for Visual Studio Code: https://bit.ly/vscode-gig
|
||||||
|
|
||||||
|
# Created by https://www.gitignore.io/api/visualstudiocode,macos,node
|
||||||
|
# Edit at https://www.gitignore.io/?templates=visualstudiocode,macos,node
|
||||||
|
|
||||||
|
### macOS ###
|
||||||
|
# General
|
||||||
.DS_Store
|
.DS_Store
|
||||||
Thumbs.db
|
.AppleDouble
|
||||||
|
.LSOverride
|
||||||
|
|
||||||
|
# Icon must end with two \r
|
||||||
|
Icon
|
||||||
|
|
||||||
|
# Thumbnails
|
||||||
|
._*
|
||||||
|
|
||||||
|
# Files that might appear in the root of a volume
|
||||||
|
.DocumentRevisions-V100
|
||||||
|
.fseventsd
|
||||||
|
.Spotlight-V100
|
||||||
|
.TemporaryItems
|
||||||
|
.Trashes
|
||||||
|
.VolumeIcon.icns
|
||||||
|
.com.apple.timemachine.donotpresent
|
||||||
|
|
||||||
|
# Directories potentially created on remote AFP share
|
||||||
|
.AppleDB
|
||||||
|
.AppleDesktop
|
||||||
|
Network Trash Folder
|
||||||
|
Temporary Items
|
||||||
|
.apdisk
|
||||||
|
|
||||||
|
### Node ###
|
||||||
|
# Logs
|
||||||
|
logs
|
||||||
|
*.log
|
||||||
|
npm-debug.log*
|
||||||
|
yarn-debug.log*
|
||||||
|
yarn-error.log*
|
||||||
|
lerna-debug.log*
|
||||||
|
|
||||||
|
# Diagnostic reports (https://nodejs.org/api/report.html)
|
||||||
|
report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
|
||||||
|
|
||||||
|
# Runtime data
|
||||||
|
pids
|
||||||
|
*.pid
|
||||||
|
*.seed
|
||||||
|
*.pid.lock
|
||||||
|
|
||||||
|
# Directory for instrumented libs generated by jscoverage/JSCover
|
||||||
|
lib-cov
|
||||||
|
|
||||||
|
# Coverage directory used by tools like istanbul
|
||||||
|
coverage
|
||||||
|
*.lcov
|
||||||
|
|
||||||
|
# nyc test coverage
|
||||||
|
.nyc_output
|
||||||
|
|
||||||
|
# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
|
||||||
|
.grunt
|
||||||
|
|
||||||
|
# Bower dependency directory (https://bower.io/)
|
||||||
|
bower_components
|
||||||
|
|
||||||
|
# node-waf configuration
|
||||||
|
.lock-wscript
|
||||||
|
|
||||||
|
# Compiled binary addons (https://nodejs.org/api/addons.html)
|
||||||
|
build/Release
|
||||||
|
|
||||||
|
# Dependency directories
|
||||||
|
node_modules/
|
||||||
|
jspm_packages/
|
||||||
|
|
||||||
|
# TypeScript v1 declaration files
|
||||||
|
typings/
|
||||||
|
|
||||||
|
# TypeScript cache
|
||||||
|
*.tsbuildinfo
|
||||||
|
|
||||||
|
# Optional npm cache directory
|
||||||
|
.npm
|
||||||
|
|
||||||
|
# Optional eslint cache
|
||||||
|
.eslintcache
|
||||||
|
|
||||||
|
# Optional REPL history
|
||||||
|
.node_repl_history
|
||||||
|
|
||||||
|
# Output of 'npm pack'
|
||||||
|
*.tgz
|
||||||
|
|
||||||
|
# Yarn Integrity file
|
||||||
|
.yarn-integrity
|
||||||
|
|
||||||
|
# dotenv environment variables file
|
||||||
|
.env
|
||||||
|
.env.test
|
||||||
|
|
||||||
|
# parcel-bundler cache (https://parceljs.org/)
|
||||||
|
.cache
|
||||||
|
|
||||||
|
# next.js build output
|
||||||
|
.next
|
||||||
|
|
||||||
|
# nuxt.js build output
|
||||||
|
.nuxt
|
||||||
|
|
||||||
|
# rollup.js default build output
|
||||||
|
dist/
|
||||||
|
|
||||||
|
# Uncomment the public line if your project uses Gatsby
|
||||||
|
# https://nextjs.org/blog/next-9-1#public-directory-support
|
||||||
|
# https://create-react-app.dev/docs/using-the-public-folder/#docsNav
|
||||||
|
# public
|
||||||
|
|
||||||
|
# Storybook build outputs
|
||||||
|
.out
|
||||||
|
.storybook-out
|
||||||
|
|
||||||
|
# vuepress build output
|
||||||
|
.vuepress/dist
|
||||||
|
|
||||||
|
# Serverless directories
|
||||||
|
.serverless/
|
||||||
|
|
||||||
|
# FuseBox cache
|
||||||
|
.fusebox/
|
||||||
|
|
||||||
|
# DynamoDB Local files
|
||||||
|
.dynamodb/
|
||||||
|
|
||||||
|
# Temporary folders
|
||||||
|
tmp/
|
||||||
|
temp/
|
||||||
|
|
||||||
|
### VisualStudioCode ###
|
||||||
|
.vscode/*
|
||||||
|
!.vscode/settings.json
|
||||||
|
!.vscode/tasks.json
|
||||||
|
!.vscode/launch.json
|
||||||
|
!.vscode/extensions.json
|
||||||
|
|
||||||
|
### VisualStudioCode Patch ###
|
||||||
|
# Ignore all local history of files
|
||||||
|
.history
|
||||||
|
|
||||||
|
# End of https://www.gitignore.io/api/visualstudiocode,macos,node
|
||||||
|
|
||||||
|
# Custom rules (everything added below won't be overridden by 'Generate .gitignore File' if you use 'Update' option)
|
||||||
|
|
||||||
|
|||||||
Binary file not shown.
|
Before Width: | Height: | Size: 24 KiB |
4433
package-lock.json
generated
Normal file
4433
package-lock.json
generated
Normal file
File diff suppressed because it is too large
Load Diff
17
package.json
Normal file
17
package.json
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
{
|
||||||
|
"name": "data-extractor",
|
||||||
|
"scripts": {
|
||||||
|
"dev": "webpack --mode=development --devtool=inline-source-map --watch",
|
||||||
|
"prod": "webpack --mode=production"
|
||||||
|
},
|
||||||
|
"devDependencies": {
|
||||||
|
"@types/chrome": "0.0.91",
|
||||||
|
"@types/node": "^13.1.6",
|
||||||
|
"copy-webpack-plugin": "^5.1.1",
|
||||||
|
"ts-loader": "^6.2.1",
|
||||||
|
"tslint": "^5.20.1",
|
||||||
|
"typescript": "^3.7.4",
|
||||||
|
"webpack": "^4.41.5",
|
||||||
|
"webpack-cli": "^3.3.10"
|
||||||
|
}
|
||||||
|
}
|
||||||
14
popup/tip.js
14
popup/tip.js
@ -1,14 +0,0 @@
|
|||||||
/**
 * Wire up the popup links once the popup page has loaded.
 * Clicking a link opens its target url in a new browser tab.
 */
window.onload = function () {
    // Attach a click handler to the element matching `selector`; the url is
    // computed lazily (at click time) by `makeUrl`.
    const openInNewTab = (selector, makeUrl) => {
        const link = document.querySelector(selector);
        link.addEventListener('click', () => {
            chrome.tabs.create({ 'url': makeUrl() });
        });
    };

    // Details page of this extension.
    openInNewTab('#link-extension-detail', () => `chrome://extensions/?id=${chrome.runtime.id}`);
    // Project documentation.
    openInNewTab('#link-document', () => `https://git.jebbs.co/jebbs/data-extracter-extesion`);
}
|
|
||||||
160
readme.md
160
readme.md
@ -8,15 +8,21 @@ All you need to do is:
|
|||||||
- Find out the selectors for target data
|
- Find out the selectors for target data
|
||||||
- Type scripts in the console of the `extension background page`, as introduced below.
|
- Type scripts in the console of the `extension background page`, as introduced below.
|
||||||
|
|
||||||

|

|
||||||
|
|
||||||
## Quick Start
|
## Quick Start
|
||||||
|
|
||||||
Extract current page
|
Extract current page
|
||||||
|
|
||||||
```js
|
```js
|
||||||
$('.item', ['a', 'a@href']);
|
$('.item', ['a', 'a@href']);
|
||||||
|
new Extractor().task('.item', ['a', 'a@href']).start();
|
||||||
|
// fieldSelectors can be empty strings if items have no child to select
|
||||||
|
new Extractor().task('.item a', ['', '@href']).start();
|
||||||
```
|
```
|
||||||
|
|
||||||
|
> `$(...args)` is the short form of `new Extractor().task(...args).start();`, which is introduced later.
|
||||||
|
|
||||||
Extract multiple pages (1-10, interval 1)
|
Extract multiple pages (1-10, interval 1)
|
||||||
|
|
||||||
```js
|
```js
|
||||||
@ -50,17 +56,22 @@ function (itemsSelector:string, fieldSelectors:string[], urls:string[])
|
|||||||
function (itemsSelector:string, fieldSelectors:string[], urls:ExtractResult)
|
function (itemsSelector:string, fieldSelectors:string[], urls:ExtractResult)
|
||||||
```
|
```
|
||||||
|
|
||||||
## Advanced Usage
|
## Stop Tasks
|
||||||
|
|
||||||
### Stop Tasks
|
Close the target tab in which the current tasks are running.
|
||||||
|
|
||||||
Tasks wait for their target elements' appearance, given some elements were loaded asynchronously.
|
Or use `job.stop()`:
|
||||||
|
|
||||||
But if you typed wrong selectors, the task waits forever for elements which don't exists.
|
```js
|
||||||
|
job = new Extractor().task('.search-list-item', ['a@href'], ["http://sample.com/abc"])
|
||||||
|
.task('list-item', ["a.title", "p.content"])
|
||||||
|
.start();
|
||||||
|
job.stop();
|
||||||
|
```
|
||||||
|
|
||||||
The only way to stop tasks before its finish, is `Closing the host tab`.
|
> Next time you call `job.start();`, the job will continue from where it stopped.
|
||||||
|
|
||||||
### Extract Attributes.
|
## Extract Attributes
|
||||||
|
|
||||||
e.g.: link text and target (use 'selector@attribute')
|
e.g.: link text and target (use 'selector@attribute')
|
||||||
|
|
||||||
@ -68,20 +79,43 @@ e.g.: link text and target (use 'selector@attribute')
|
|||||||
new Extractor().task('.item', ['a', 'a@href']).start();
|
new Extractor().task('.item', ['a', 'a@href']).start();
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Click Selected Elements
|
||||||
|
|
||||||
|
The following clicks selected links and extracts link `text` and `href`
|
||||||
|
|
||||||
|
```js
|
||||||
|
new Extractor().task('.item', ['!a', 'a@href']).start();
|
||||||
|
```
|
||||||
|
|
||||||
|
## Advanced Usage
|
||||||
|
|
||||||
### Use Task Chain.
|
### Use Task Chain.
|
||||||
|
|
||||||
e.g.: Collect links from `http://sample.com/abc`, then, Extract data of each link
|
e.g.: Collect links from `http://sample.com/abc`, then, Extract data of each link
|
||||||
|
|
||||||
```js
|
```js
|
||||||
new Extractor()
|
e = new Extractor()
|
||||||
.task('.search-list-item', ['a@href'], ["http://sample.com/abc"])
|
e.task('.search-list-item', ['a@href'], ["http://sample.com/abc"])
|
||||||
.task('list-item', ["a.title", "p.content"])
|
.task('list-item', ["a.title", "p.content"])
|
||||||
.start();
|
.start();
|
||||||
```
|
```
|
||||||
|
|
||||||
### Save Result of Any Task
|
### Extractor Options
|
||||||
|
|
||||||
To a multiple task (chain) Extractor `e`:
|
Specify extra options to make a task perform some actions before scraping the data.
|
||||||
|
|
||||||
|
```js
|
||||||
|
var job = new Extractor({ "scrollToBottom": 1 });
|
||||||
|
```
|
||||||
|
|
||||||
|
Available options:
|
||||||
|
|
||||||
|
- `scrollToBottom`: Try to scroll pages to the bottom, since some elements are loaded only when the user needs them.
|
||||||
|
|
||||||
|
|
||||||
|
### Export Result of Any Task
|
||||||
|
|
||||||
|
To a multiple task Extractor `e`:
|
||||||
|
|
||||||
```js
|
```js
|
||||||
e = new Extractor()
|
e = new Extractor()
|
||||||
@ -90,27 +124,54 @@ e.task('.search-list-item', ['a@href'], ["http://sample.com/abc"])
|
|||||||
.start();
|
.start();
|
||||||
```
|
```
|
||||||
|
|
||||||
User will be asked to save the final result when it finishes.
|
User will be asked to export the final result when it finishes.
|
||||||
|
|
||||||
Incase you want to save it again, use:
|
In case you want to export it again, use:
|
||||||
|
|
||||||
```js
|
```js
|
||||||
e.save()
|
e.export()
|
||||||
```
|
```
|
||||||
|
|
||||||
You may want to save another task's result, other than the final:
|
To export another task result, other than the final one:
|
||||||
|
|
||||||
```js
|
```js
|
||||||
// save the result of first task
|
// export the result of first task
|
||||||
// to the example above, that is a list of urls
|
// to the example above, that is a list of urls
|
||||||
e.save(1)
|
e.export(0)
|
||||||
|
// export the result of second task
|
||||||
|
e.export(1)
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Task Management
|
||||||
|
|
||||||
|
### Continue Tasks
|
||||||
|
|
||||||
|
Sometimes, it's hard to finish them in a single execution; that's why we need "Continuing of Tasks".
|
||||||
|
|
||||||
|
You can always continue tasks by starting them again, no matter in what phase they stopped.
|
||||||
|
|
||||||
|
```js
|
||||||
|
e.start()
|
||||||
|
```
|
||||||
|
|
||||||
|
The `Extractor` keeps the execution state, and starts from where it stopped.
|
||||||
|
|
||||||
### Restart Tasks
|
### Restart Tasks
|
||||||
|
|
||||||
In cases some later task fails, you don't need to restart all task.
|
What if I don't like to continue from last state, but restart certain tasks?
|
||||||
|
|
||||||
Here we have 2 tasks:
|
```js
|
||||||
|
// restart all tasks
|
||||||
|
e.restart(0)
|
||||||
|
// restart from 2nd task
|
||||||
|
e.restart(1)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Save & Load State
|
||||||
|
|
||||||
|
It may also be hard to finish tasks even in a single day, so we need a way to save the current state and come back tomorrow.
|
||||||
|
|
||||||
|
Create and run an extractor:
|
||||||
|
|
||||||
```js
|
```js
|
||||||
e = new Extractor()
|
e = new Extractor()
|
||||||
@ -119,16 +180,67 @@ e.task('.search-list-item', ['a@href'], ["http://sample.com/abc"])
|
|||||||
.start();
|
.start();
|
||||||
```
|
```
|
||||||
|
|
||||||
Suppose the second task fails, we can restart and continue from the task 2:
|
Save the state:
|
||||||
|
|
||||||
```js
|
```js
|
||||||
e.restart(2);
|
e.save();
|
||||||
```
|
```
|
||||||
|
|
||||||
If you'd like restart all task, use:
|
Load the state:
|
||||||
|
|
||||||
|
Open the popup window and upload the saved state file. Then, in the background console:
|
||||||
|
|
||||||
```js
|
```js
|
||||||
|
e = new Extractor().load();
|
||||||
e.start();
|
e.start();
|
||||||
// or
|
```
|
||||||
e.restart();
|
|
||||||
|
> The uploaded state will be cleaned in 30 seconds, if you don't load it.
|
||||||
|
|
||||||
|
## Watch Mode
|
||||||
|
|
||||||
|
Watch mode tries to extract data from every page you visit **in the current window**.
|
||||||
|
|
||||||
|
```js
|
||||||
|
e = new Extractor();
|
||||||
|
e.task('.search-list-item', ['a@href'], ["http://sample.com/abc"])
|
||||||
|
.task('list-item', ["a.title", "p.content"]);
|
||||||
|
e.watch(1); // start watching for first task
|
||||||
|
```
|
||||||
|
|
||||||
|
To stop watching, you can either `close current window`, or:
|
||||||
|
|
||||||
|
```js
|
||||||
|
e.stop();
|
||||||
|
```
|
||||||
|
|
||||||
|
## Results Operation
|
||||||
|
|
||||||
|
To get the results of a task:
|
||||||
|
|
||||||
|
```js
|
||||||
|
let results = job.results(0);
|
||||||
|
```
|
||||||
|
|
||||||
|
Visit URLs (if any) in the results one by one:
|
||||||
|
|
||||||
|
```js
|
||||||
|
results.visit();
|
||||||
|
```
|
||||||
|
|
||||||
|
Walk through all results one by one:
|
||||||
|
|
||||||
|
```js
|
||||||
|
results.walk((row,col,value)=>{console.log(value)});
|
||||||
|
```
|
||||||
|
|
||||||
|
## Development
|
||||||
|
|
||||||
|
Clone this project and execute:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
npm i
|
||||||
|
npm run prod
|
||||||
|
# or
|
||||||
|
npm run dev
|
||||||
```
|
```
|
||||||
@ -1,168 +0,0 @@
|
|||||||
/**
 * Extract data from the current page or from multiple urls.
 *
 * Supported call signatures (the leading `tab` argument is optional; any
 * non-string first argument is taken as the tab):
 *   getData([tab,] itemsSelector:string, fieldSelectors:string[])
 *   getData([tab,] itemsSelector:string, fieldSelectors:string[], url:string, from:number, to:number, interval:number)
 *   getData([tab,] itemsSelector:string, fieldSelectors:string[], url:string, pages:number[])
 *   getData([tab,] itemsSelector:string, fieldSelectors:string[], urls:string[])
 *   getData([tab,] itemsSelector:string, fieldSelectors:string[], urls:ExtractResult)
 *
 * When urls are given they are visited one after another in the same tab;
 * the first failure aborts the remaining urls.
 *
 * @param {...any} args see the signatures above
 * @returns {Promise<ExtractResult>} the extracted rows, preceded by a header
 *     row holding `fieldSelectors`
 * @throws {Error} (as a rejection) when the arguments are invalid or no tab
 *     can be found
 */
async function getData(...args) {
    // A non-string first argument is the target tab (it may be undefined).
    const hasTab = typeof args[0] !== 'string';
    let tab = hasTab ? args[0] : undefined;
    const callArgs = hasTab ? args.slice(1) : args;
    if (!testArgs(...callArgs))
        throw new Error(`Invalid call arguments.\n\n${signitures}\n\n`);

    // Local bindings: these used to leak as implicit globals, which let
    // concurrent calls overwrite each other's selectors.
    const [itemsSelector, fieldSelectors, ...urlArgs] = callArgs;
    const urls = parseUrls(...urlArgs);

    if (!tab) tab = await getActiveTab(true) || await getActiveTab(false);
    if (!tab) throw new Error("Cannot find active tab.");

    // The first row is the header (the field selectors themselves).
    const data = [fieldSelectors];
    const collect = results => {
        if (!results) return;
        for (const row of results) data.push(row);
    };

    if (urls.length) {
        for (const url of urls) {
            await redirectTab(tab, url);
            collect(await extractTabData(tab, itemsSelector, fieldSelectors));
        }
    } else {
        collect(await extractTabData(tab, itemsSelector, fieldSelectors));
    }
    return new ExtractResult(data);
}
|
|
||||||
|
|
||||||
/**
 * Build the list of target urls from the url arguments of a task.
 *
 * Accepted forms:
 *   ()                                -> []
 *   (urls: string[])                  -> the given array
 *   (urls: ExtractResult)             -> every non-empty cell of the result
 *   (template: string, pages: any[])  -> template with "${page}" replaced by each page
 *   (template: string, from, to, interval)
 *                                     -> template with "${page}" replaced by
 *                                        from, from+interval, ... up to `to`
 * Anything else yields an empty list.
 *
 * @param {...any} args url arguments as described above
 * @returns {string[]} the urls to visit
 * @throws {RangeError} if `interval` is not a positive number (the range
 *     would never terminate)
 */
function parseUrls(...args) {
    if (!args.length) return [];
    const [first, ...rest] = args;
    if (first instanceof Array) return first;
    if (first instanceof ExtractResult) return first.squash().filter(v => !!v);
    if (!first) return [];

    // Replace every occurrence of the placeholder, not just the first one.
    const fill = page => first.split("${page}").join(String(page));

    if (rest[0] instanceof Array) return rest[0].map(fill);
    if (rest.length < 3) return [];

    const [from, to, interval] = rest;
    if (!(interval > 0)) {
        throw new RangeError(`Invalid page interval: ${interval}. It must be a positive number.`);
    }
    const urls = [];
    for (let page = from; page <= to; page += interval) {
        urls.push(fill(page));
    }
    return urls;
}
|
|
||||||
|
|
||||||
/**
 * Navigate the tab to `url` (unless it already reports that url) and wait
 * until the tab reports a url different from the one it had before.
 *
 * Note that the final check only waits for the url to change; it does not
 * verify that the new url equals `url`.
 *
 * @param {any} tab target tab
 * @param {string} url url to navigate to
 * @returns {Promise<string>} the url reported by the tab after navigation
 */
function redirectTab(tab, url) {
    // Url the tab had before navigating; stays "" when no navigation is needed,
    // which makes the final query resolve with whatever url is reported.
    let curUrl = "";
    return queryUrl(tab, undefined, 'Query current url...')
        .then(u => {
            if (url === u) return;
            curUrl = u;
            const req = {
                action: ACTION_GOTO_URL,
                url: url
            };
            // Deliberately not awaited: completion is detected by the url
            // check below. Presumably the page can unload before it answers,
            // so a failed response is only logged instead of being left as an
            // unhandled rejection.
            sendMessage(tab, req, `Goto url: ${url}`)
                .catch(err => console.log(`Goto url: ${url} (no response: ${err})`));
        })
        .then(() => queryUrl(tab, curUrl, 'Check if tab url matches expected...'));
}
|
|
||||||
|
|
||||||
/**
 * Request data extraction from the target tab.
 * The request is repeated (by sendMessage) for as long as the response
 * equals MSG_ELEMENT_NOT_FOUND.
 * @param {any} tab target tab
 * @param {string} itemsSelector selector for the items (data rows)
 * @param {Array<string>} fieldSelectors selectors for the fields (data columns) under each item
 * @returns {Promise<any>} a promise of the extraction response of the tab
 */
function extractTabData(tab, itemsSelector, fieldSelectors) {
    const isFound = response => !MSG_ELEMENT_NOT_FOUND.isEqual(response);
    return sendMessage(
        tab,
        { action: ACTION_EXTRACT, itemsSelector, fieldSelectors },
        'Extract data from the tab...',
        isFound
    );
}
|
|
||||||
|
|
||||||
/**
 * Ask the target tab to report in; usually used to detect whether the
 * content script is ready. Resolves once the tab echoes the action back.
 * @param {any} tab target tab
 * @returns {Promise<string>} a promise of the report-in response
 */
function reportIn(tab) {
    const request = { action: ACTION_REPORT_IN };
    const isEcho = reply => reply == request.action;
    return sendMessage(tab, request, 'Check tab availability...', isEcho);
}
|
|
||||||
|
|
||||||
/**
 * Query the url of the target tab.
 * @param {any} tab target tab
 * @param {string} urlExcluded if given, the query is repeated until the tab
 *     reports a url different from this one
 * @param {string} log message logged to the console for each attempt
 * @returns {Promise<string>} a promise of the url
 */
function queryUrl(tab, urlExcluded, log) {
    const accept = url => {
        if (!url) return url;            // no url reported yet
        if (!urlExcluded) return true;   // any url is acceptable
        return urlExcluded != url;       // wait until the url has changed
    };
    return sendMessage(tab, { action: ACTION_QUERY_URL }, log, accept);
}
|
|
||||||
|
|
||||||
/**
 * Promise wrapper around chrome.tabs.create.
 * @param {string} url url to open in the new tab
 * @param {boolean} active whether the new tab should become the active one
 * @returns {Promise<any>} a promise of the tab passed to the create callback
 */
async function createTab(url, active) {
    const options = { 'url': url, 'active': active };
    return new Promise(resolve => {
        chrome.tabs.create(options, tab => resolve(tab));
    });
}
|
|
||||||
|
|
||||||
/**
 * Find the active tab via chrome.tabs.query.
 * @param {boolean} currentWindow value of the `currentWindow` query filter
 * @returns {Promise<any>} a promise of the first matching tab
 *     (undefined when the query matches nothing)
 */
async function getActiveTab(currentWindow) {
    const filter = { active: true, currentWindow: currentWindow };
    return new Promise(resolve => {
        chrome.tabs.query(filter, tabs => {
            resolve(tabs[0]);
        });
    });
}
|
|
||||||
|
|
||||||
/**
 * Look up a tab by id via chrome.tabs.get.
 * @param {number} id id of the tab
 * @returns {Promise<any>} a promise of the tab passed to the callback
 *     (callers treat a falsy value as "tab not available")
 */
async function getTabByID(id) {
    return new Promise(resolve => {
        chrome.tabs.get(id, tab => {
            // Touch lastError on purpose — presumably so that Chrome does not
            // report it as unchecked when the tab does not exist.
            chrome.runtime.lastError;
            resolve(tab);
        });
    });
}
|
|
||||||
@ -1,151 +0,0 @@
|
|||||||
/**
 * Runs a chain of extraction tasks in one tab and caches the result of
 * every task so that later tasks can be restarted and results saved.
 * Task ids used by the public methods are 1-based.
 */
class Extractor {
    constructor() {
        // Argument lists of the tasks, in chain order.
        this._tasks = [];
        // Tab in which the tasks run.
        this._tab = undefined;
        // True while a chain is executing.
        this._running = false;
        // Result of each task, indexed like `_tasks`.
        this._results = [];
    }
    /**
     * Add a task to Extractor.
     * One Extractor can have multiple tasks, which are organized in a task chain.
     * A later task uses the previous task's result as input (target url list).
     * So only the first task can have target url arguments, while later tasks can't.
     * @param {...any} args itemsSelector, fieldSelectors, and more args to specify target urls.
     * @returns {Extractor} this, for chaining
     */
    task(...args) {
        if (!testArgs(...args)) {
            console.log(`Invalid task arguments: ${argsToString(...args)}\n\n${signitures}\n`);
            return this;
        }
        // More than 2 arguments means the task specifies its own target pages,
        // so it won't accept the last task result as url list.
        // In this case, former tasks are useless and are cleared.
        if (args.length > 2) this.clear();
        this._tasks.push(args);
        return this;
    }
    /**
     * Clear tasks and caches.
     * @returns {Extractor} this, for chaining
     */
    clear() {
        this._tasks = [];
        this._results = [];
        return this;
    }
    /**
     * Start the task chain from the first task.
     * @returns {Promise<void>} settles when the chain has finished (errors of
     *     the chain itself are logged, not rethrown)
     */
    async start() {
        if (this._running) {
            console.log('The Extractor is running. Please wait..');
            return;
        }
        if (!this._tasks.length) {
            console.log('No task to run.');
            return;
        }
        // Mark as running before the first await so that a second start()
        // issued meanwhile is refused.
        this._running = true;
        try {
            const firstTaskArgs = this._tasks[0];
            if (firstTaskArgs.length > 2) {
                // task specifies target urls, create new tab with first url for it
                const urls = parseUrls(...firstTaskArgs.slice(2));
                this._tab = await createTab(urls[0], false);
            } else {
                this._tab = await getActiveTab(false);
            }
        } catch (err) {
            this._running = false;
            throw err;
        }
        return this._run(0, undefined);
    }
    /**
     * Restart from the specified task, without re-running the previous tasks.
     * The cached result of the preceding task is used as input.
     * @param {number} taskid 1-based id of the task to restart from (default 1)
     * @returns {Promise<void>} settles when the chain has finished
     */
    async restart(taskid) {
        if (this._running) {
            console.log('The Extractor is running. Please wait..');
            return;
        }
        taskid = this._checkTaskId(taskid, 1);
        if (!taskid) return;
        if (taskid === 1) return this.start();
        const cache = this._results[taskid - 2];
        if (!cache) {
            console.log(`No result cache for task (id ${taskid}). \nMake sure call ".start()" before ".restart()"?`);
            return;
        }
        this._running = true;
        try {
            this._tab = await createTab(parseUrls(cache)[0], false);
        } catch (err) {
            this._running = false;
            throw err;
        }
        return this._run(taskid - 1, cache);
    }
    /**
     * Save result of a task (asks for confirmation first).
     * @param {number} taskid 1-based id of the task to save (default: the last task).
     */
    save(taskid) {
        taskid = this._checkTaskId(taskid, this._tasks.length);
        if (!taskid) return;
        const result = this._results[taskid - 1];
        if (!result) {
            console.log(`No result for task #${taskid}. Forget to call ".start()"?`);
            return;
        }
        if (result.data.length <= 1) { // 1 for selector headers
            console.log(`No result for task #${taskid}. Forget to call ".start()"?`);
            return;
        }
        let msg = `
Please confirm to download (${result.data.length - 1} items):

${result.toString(50) || "- Empty -"}
`.trim();
        if (confirm(msg)) {
            saveFile(result, "text/csv");
        }
    }
    /**
     * Run the tasks from `fromIndex` (0-based) to the end of the chain, one
     * after another, then offer the final result for saving.
     * @param {number} fromIndex index of the first task to run
     * @param {any} input result fed to the first task as its url list, or
     *     undefined when that task has no input
     */
    async _run(fromIndex, input) {
        // Snapshot, so that tasks added or cleared meanwhile don't affect this run.
        const tasks = this._tasks.slice();
        this._running = true;
        try {
            let result = input;
            for (let i = fromIndex; i < tasks.length; i++) {
                result = result === undefined
                    ? await getData(this._tab, ...tasks[i])
                    : await getData(this._tab, ...tasks[i], result);
                this._results[i] = result;
            }
            this._running = false;
            console.log("Tasks are all done.")
            this.save();
        } catch (err) {
            console.log(err)
        } finally {
            this._running = false;
        }
    }
    /**
     * Validate a 1-based task id.
     * @param {number} id id to check; undefined/null falls back to `defaultId`
     * @param {number} defaultId id used when `id` is not given
     * @returns {number} the valid id, or 0 when there are no tasks or the id is invalid
     */
    _checkTaskId(id, defaultId) {
        if (!this._tasks.length) {
            console.log("No task found.");
            return 0;
        }
        if (defaultId && (id === undefined || id === null)) id = defaultId;
        const taskid = Number(id);
        if (!Number.isInteger(taskid) || taskid < 1 || taskid > this._tasks.length) {
            console.log(`Invalid task id. Rang(1-${this._tasks.length})`);
            return 0;
        }
        return taskid;
    }
}
|
|
||||||
@ -1,3 +0,0 @@
|
|||||||
/**
 * Short form of `new Extractor().task(...args).start()`:
 * creates a one-task Extractor and starts it right away.
 * @param {...any} args arguments of the task
 * @returns {Promise} the promise returned by `start()`
 */
function $(...args) {
    const extractor = new Extractor();
    const configured = extractor.task(...args);
    return configured.start();
}
|
|
||||||
@ -1,50 +0,0 @@
|
|||||||
|
|
||||||
/**
 * Repeatedly send a message to the target tab until the response is accepted.
 * @param {object} tab the tab where to send the message
 * @param {object} req the request data.
 * @param {string} log message logged to console after every attempt (nothing is logged if empty).
 * @param {function} cond success condition function, r:any=>boolean; any response is accepted if omitted
 * @param {number} interval delay in milliseconds between attempts (defaults to 500)
 * @return {Promise} a promise of the accepted response. It rejects when the
 *     tab is no longer available, when chrome reports a runtime error, or
 *     when an attempt (including `cond`) throws.
 */
function sendMessage(tab, req, log, cond, interval) {
    const delay = interval || 500;
    return new Promise((resolve, reject) => {
        const attempt = async () => {
            const tabAvailable = await getTabByID(tab.id);
            if (!tabAvailable) {
                reject("Task interrupted due to the target tab is closed.");
                return;
            }
            chrome.tabs.sendMessage(tab.id, req, r => {
                if (chrome.runtime.lastError) {
                    reject(chrome.runtime.lastError.message);
                    return;
                }
                let flag;
                try {
                    flag = !cond || cond(r);
                } catch (err) {
                    // a throwing condition would otherwise leave the promise pending
                    reject(err);
                    return;
                }
                if (log) console.log(log, flag ? '(OK)' : '(failed)');
                if (flag) {
                    resolve(r);
                } else {
                    setTimeout(run, delay);
                }
            });
        };
        // Route any failure of an attempt into the returned promise instead
        // of leaving it as an unhandled rejection.
        const run = () => {
            attempt().catch(reject);
        };
        run();
    });
}
|
|
||||||
|
|
||||||
// Messages arriving at this listener that carry one of the extension's own
// actions (prefixed with EXT_NAME) are answered with a refusal; all other
// messages are ignored.
chrome.runtime.onMessage.addListener((message, sender, sendResponse) => {
    const isExtensionAction = message.action && message.action.startsWith(EXT_NAME);
    if (isExtensionAction) {
        sendResponse("Calling from user pages is not allowed.");
    }
});
|
|
||||||
@ -1,34 +0,0 @@
|
|||||||
/**
 * Tabular result of an extraction: an array of rows, each row an array of
 * cells.
 */
class ExtractResult {
    /**
     * @param {Array<Array<any>>} data rows of the result (defaults to no rows)
     */
    constructor(data) {
        this._data = data || [];
    }
    /**
     * @param {number} index row index
     * @returns {Array<any>} the row at `index`
     */
    row(index) {
        return this._data[index];
    }
    /**
     * @param {number} index column index
     * @returns {Array<any>} the cell at `index` of every row
     */
    column(index) {
        return this._data.map(row => row[index]);
    }
    /**
     * @returns {Array<any>} all cells of all rows in one flat array
     */
    squash() {
        return this._data.reduce((flat, row) => flat.concat(row), []);
    }
    /** The underlying rows. */
    get data() {
        return this._data;
    }
    /**
     * Render the result as CSV text: every cell is trimmed, quoted and has
     * its double quotes doubled; every row ends with "\n".
     * Missing cells (null / undefined) are rendered as empty
     * strings, other non-string cells are converted with String().
     * @param {number} rowsCount if > 0, only the first `rowsCount` rows are rendered
     * @returns {string} the CSV text
     */
    toString(rowsCount) {
        const rows = rowsCount > 0 ? this._data.slice(0, rowsCount) : this._data;
        const toCell = cell => {
            const text = cell === null || cell === undefined ? "" : String(cell).trim();
            return '"' + text.replace(/"/g, '""') + '"';
        };
        return rows.map(cells => cells.map(toCell).join(",") + "\n").join("");
    }
}
|
|
||||||
@ -1,53 +0,0 @@
|
|||||||
// Dispatches the requests received by this script by their `action`:
// extract data, navigate to a url, report in, or report the current url.
// Requests without an action, or with an unknown one, are ignored.
chrome.runtime.onMessage.addListener((request, sender, sendResponse) => {
    if (!request.action) return;
    // console.log("Recieved request:",request);

    // Answer only when a response callback was supplied.
    const reply = value => {
        if (sendResponse) sendResponse(value);
    };

    if (request.action === ACTION_EXTRACT) {
        reply(extract(request.itemsSelector, request.fieldSelectors));
    } else if (request.action === ACTION_GOTO_URL) {
        window.location.replace(request.url);
        reply(request.url);
    } else if (request.action === ACTION_REPORT_IN) {
        reply(request.action);
    } else if (request.action === ACTION_QUERY_URL) {
        reply(window.location.href);
    }
});
|
|
||||||
|
|
||||||
/**
 * Extract a table of values from the current document.
 *
 * Every element matching `itemsSelector` becomes a row; every entry of
 * `fieldSelectors` becomes a column. A field selector has the form
 * "selector" (trimmed text content) or "selector@attr" (the `attr` property
 * of the element). Multiple matches inside one item are joined with "\n";
 * a field without a match in an item yields undefined for that cell.
 *
 * Because elements may be loaded asynchronously, MSG_ELEMENT_NOT_FOUND is
 * returned when no item is found, or when some field is not found in any
 * row, so that senders can retry. With wrong selectors the retry never ends.
 *
 * @param {string} itemsSelector selector of the items (rows)
 * @param {Array<string>} fieldSelectors selectors of the fields (columns)
 * @returns {Array<Array<string>>|ConstMessage} the rows, or MSG_ELEMENT_NOT_FOUND
 */
function extract(itemsSelector, fieldSelectors) {
    const items = Array.from(document.querySelectorAll(itemsSelector));
    // items may not be loaded yet, tell the sender to retry.
    if (!items.length) return MSG_ELEMENT_NOT_FOUND;

    // Field selectors that matched in at least one item.
    const found = {};

    const readField = (item, selector) => {
        const [cls, attr] = selector.split('@');
        const matches = Array.from(item.querySelectorAll(cls));
        if (!matches.length) return undefined;
        found[selector] = true;
        return matches
            .map(el => attr ? el[attr] : el.textContent.trim())
            .join('\n');
    };

    const results = items.map(
        item => fieldSelectors.map(selector => readField(item, selector))
    );

    // if there is a field which is not found in any row, the sender should retry.
    const shouldWait = fieldSelectors.some(selector => !found[selector]);
    return shouldWait ? MSG_ELEMENT_NOT_FOUND : results;
}
|
|
||||||
@ -1,8 +0,0 @@
|
|||||||
// Extension name; prefixes every action identifier below.
const EXT_NAME = "DataExtracter";

// Action identifiers carried in `request.action` between the extension and
// its content script.
const ACTION_EXTRACT = EXT_NAME + ":Extract";
const ACTION_GOTO_URL = EXT_NAME + ":GoToTUL";
const ACTION_REPORT_IN = EXT_NAME + ":ReportIn";
const ACTION_QUERY_URL = EXT_NAME + ":QueryURL";

// Marker returned by the extraction when elements are missing (possibly not
// loaded yet), so that the sender can retry.
const MSG_ELEMENT_NOT_FOUND = new ConstMessage(1, "No element found for at least one selector, maybe it's not loaded yet");
|
|
||||||
@ -1,42 +0,0 @@
|
|||||||
/**
 * A well-known message identified by an id, so that it can still be
 * recognised by value after the original object identity is lost
 * (for example once it has been copied).
 */
class ConstMessage {
    /**
     * @param {number} id identifier used for comparison
     * @param {string} message human readable description
     */
    constructor(id, message) {
        this.id = id;
        this.message = message;
    }

    /**
     * Tell whether `err` carries the same id as this message.
     * @param {*} err any value; only its `id` property is inspected
     * @returns {boolean} true when `err` has an id equal to this one
     */
    isEqual(err) {
        // Only a missing value or a missing id means "not a message";
        // an id of 0 is a legitimate id and must still be compared.
        if (err == null || err.id == null) return false;
        // Loose comparison is kept on purpose for backward compatibility.
        return this.id == err.id;
    }
}
|
|
||||||
|
|
||||||
/**
 * Offer `data` to the user as a file download.
 *
 * @param {*} data content of the file, passed as the single part of a Blob
 * @param {string} mimeType MIME type of the file
 * @param {string} [fileName] suggested file name; falls back to the document
 *   title, then to "result"
 */
function saveFile(data, mimeType, fileName) {
    fileName = fileName || document.title || "result";

    let blob;
    if (typeof window.Blob == "function") {
        blob = new Blob([data], {
            type: mimeType
        });
    } else {
        // Legacy engines without a Blob constructor.
        const LegacyBlobBuilder = window.BlobBuilder || window.MozBlobBuilder || window.WebKitBlobBuilder || window.MSBlobBuilder;
        const builder = new LegacyBlobBuilder();
        builder.append(data);
        blob = builder.getBlob(mimeType);
    }

    const urlApi = window.URL || window.webkitURL;
    const url = urlApi.createObjectURL(blob);
    const link = document.createElement("a");
    if ('download' in link) {
        link.style.visibility = "hidden";
        link.href = url;
        link.download = fileName;
        document.body.appendChild(link);
        try {
            const click = document.createEvent("MouseEvents");
            click.initEvent("click", true, true);
            link.dispatchEvent(click);
        } finally {
            // Never leave the helper link in the page, even if dispatch throws.
            document.body.removeChild(link);
        }
        // Release the blob once the download has had ample time to start;
        // an object URL otherwise keeps the data alive until the page unloads.
        setTimeout(() => urlApi.revokeObjectURL(url), 40000);
    } else if (navigator.msSaveBlob) {
        navigator.msSaveBlob(blob, fileName);
        // msSaveBlob works on the blob itself, the URL is not needed.
        urlApi.revokeObjectURL(url);
    } else {
        // Last resort: navigate to the blob. The URL must stay valid here.
        location.href = url;
    }
}
|
|
||||||
202
src/background/actions.ts
Normal file
202
src/background/actions.ts
Normal file
@ -0,0 +1,202 @@
|
|||||||
|
import { Actions, Request } from "../common";
|
||||||
|
import { sendMessage, ResponseChecker } from "./messaging";
|
||||||
|
import { logger } from "../common/logger";
|
||||||
|
|
||||||
|
/**
 * Redirect a tab to the given URL, unless the tab already reports that URL.
 * @param {chrome.tabs.Tab} tab target tab
 * @param {string} url target URL
 * @param {boolean} [check] when truthy, the tab URL is queried again after the
 * request; if it differs from `url` the user is asked whether to continue
 * (OK) or to retry (Cancel)
 * @returns {Promise<string>} a promise of the URL the tab ended up at;
 * it resolves to undefined when the tab was already at `url`
 */
export function redirectTab(tab: chrome.tabs.Tab, url: string, check?: boolean) {
    return queryUrl(tab).then(current => {
        if (url === current) return;

        // Confirms the navigation result; returning undefined makes sendMessage retry.
        const verify: ResponseChecker<string> = async (r, err, tryCount): Promise<string> => {
            const actual = await queryUrl(tab);
            if (actual == url) return url;
            const proceed = confirm(`Cannot navigate to target url.
expected: ${url}\n
actual: ${actual}\n
Press OK to continue, Cancel to retry. Close the tab to stop`);
            return proceed ? actual : undefined;
        };

        const req: Request = {
            action: Actions.GOTO_URL,
            url: url
        };
        return sendMessage<string>(tab, req, `Goto url: ${url}`, check ? verify : undefined);
    });
}
|
||||||
|
|
||||||
|
/**
 * Extract data from the target tab, retrying while the page yields nothing.
 * @param {chrome.tabs.Tab} tab target tab
 * @param {string} itemsSelector items selector for selecting items (data rows)
 * @param {Array<string>} fieldSelectors field selectors for selecting fields (data columns) under each item
 * @param {string} [expectedURL] sent along with the request as `url`
 * @param {boolean} [askOnfail] when truthy, the user is asked before giving up on an empty page
 * @returns {Promise<string[][]>} a promise of the extracted rows; an empty array when giving up
 */
export function extractTabData(tab: chrome.tabs.Tab, itemsSelector: string, fieldSelectors: string[], expectedURL?: string, askOnfail?: boolean) {
    const checker: ResponseChecker<string[][]> = (response, err, tryCount) => {
        if (response.error) throw response.error;
        const rows = response.result;
        if (rows && rows.length) return rows;

        // Nothing extracted: keep retrying, and only consider giving up on
        // every 20th attempt.
        if (tryCount % 20 != 0) return undefined;
        // The user may refuse to move on, in which case we keep retrying.
        if (askOnfail && !confirm('No data found in current page. \n\nContinue to next page?')) {
            return undefined;
        }
        logger.warn(`Failed after ${tryCount} tries: ${tab.url}`);
        return [];
    };

    const req: Request = {
        action: Actions.EXTRACT,
        itemsSelector: itemsSelector,
        fieldSelectors: fieldSelectors,
        url: expectedURL,
    };
    return sendMessage<string[][]>(tab, req, 'Extract data from the tab...', checker);
}
|
||||||
|
|
||||||
|
/**
 * Ping the target tab; usually used to detect whether the content script is ready.
 * @param {any} tab target tab
 * @param {number} count passed to sendMessage as its retry limit, default 1
 * @returns {Promise<boolean>} a promise of a boolean telling whether the tab answered "pong"
 */
export async function ping(tab, count = 1) {
    // Accept only an explicit "pong"; anything else makes sendMessage retry.
    const isPong: ResponseChecker<string> = (r, e, c) => {
        if (r.result == "pong") return r.result;
        return undefined;
    };
    const req = {
        action: Actions.PING
    };
    // A failed request simply counts as "no pong": the rejection is deliberately dropped.
    const answer = await sendMessage<string>(tab, req, 'Check tab availability...', isPong, 1000, 1000, count).catch(() => { });
    return answer == "pong";
}
|
||||||
|
|
||||||
|
/**
 * Ask the content script of the target tab for the tab's current URL.
 * @param {chrome.tabs.Tab} tab target tab
 * @returns {Promise<string>} a promise of the URL
 */
export function queryUrl(tab: chrome.tabs.Tab) {
    const request = { action: Actions.QUERY_URL };
    return sendMessage<string>(tab, request);
}
|
||||||
|
|
||||||
|
/**
 * Send the SCROLL_BOTTOM action to the target tab.
 * @param {chrome.tabs.Tab} tab target tab
 * @returns a promise of the content script's response
 */
export function scrollToBottom(tab: chrome.tabs.Tab) {
    const request = { action: Actions.SCROLL_BOTTOM };
    return sendMessage(tab, request, 'Scroll to page bottom...');
}
|
||||||
|
|
||||||
|
/**
 * Create a new tab, preferring an open incognito window when there is one.
 * @param {string} url URL to open in the new tab
 * @param {boolean} active whether the new tab becomes the active tab
 * @returns {Promise<chrome.tabs.Tab>} a promise of the created tab
 */
export async function createTab(url: string, active: boolean): Promise<chrome.tabs.Tab> {
    // Awaited directly (instead of being wrapped in a hand-built Promise) so
    // that a failure while looking up the window rejects this call rather
    // than leaving it pending forever.
    const incognitoWindow = await findIncognitoWindow();
    return new Promise<chrome.tabs.Tab>(resolve => {
        chrome.tabs.create({
            'url': url,
            'active': active,
            // create the tab in the incognito window first
            'windowId': incognitoWindow ? incognitoWindow.id : undefined
        }, function (tab) {
            resolve(tab);
        });
    });
}
|
||||||
|
|
||||||
|
/**
 * Look for an open incognito window among the 'normal' windows.
 * @returns {Promise<chrome.windows.Window>} a promise of the first incognito
 * window found, or of undefined when there is none
 */
export async function findIncognitoWindow(): Promise<chrome.windows.Window> {
    return new Promise((resolve, reject) => {
        const filter = { windowTypes: ['normal'] };
        chrome.windows.getAll(filter, (candidates: chrome.windows.Window[]) => {
            resolve(candidates.find(candidate => candidate.incognito));
        });
    });
}
|
||||||
|
|
||||||
|
/**
 * Promise wrapper around chrome.windows.getCurrent.
 * @returns {Promise<chrome.windows.Window>} a promise of the current window
 */
export async function getCurrentWindow(): Promise<chrome.windows.Window> {
    return new Promise((resolve, reject) => {
        chrome.windows.getCurrent((current: chrome.windows.Window) => {
            resolve(current);
        });
    });
}
|
||||||
|
|
||||||
|
/**
 * Promise wrapper around chrome.windows.get.
 * @param {number} id id of the window to look up
 * @returns {Promise<chrome.windows.Window>} a promise of whatever window
 * object chrome passes to the callback
 */
export async function getWindowByID(id: number) {
    return new Promise<chrome.windows.Window>((resolve, reject) => {
        chrome.windows.get(id, found => {
            // lastError is read on purpose and its value discarded.
            void chrome.runtime.lastError;
            resolve(found);
        });
    });
}
|
||||||
|
|
||||||
|
/**
 * Open a new incognito window.
 * @returns a promise of the created window
 */
export async function CreateIncognitoWindow() {
    return new Promise((resolve, reject) => {
        const options = <chrome.windows.CreateData>{ incognito: true };
        chrome.windows.create(options, (created: chrome.windows.Window) => {
            resolve(created);
        });
    });
}
|
||||||
|
|
||||||
|
/**
 * Get the first active tab reported by chrome.tabs.query.
 * @param {boolean} currentWindow value passed as the `currentWindow` query filter
 * @returns {Promise<chrome.tabs.Tab>} a promise of the first matching tab,
 * undefined when the query matches nothing
 */
export async function getActiveTab(currentWindow: boolean): Promise<chrome.tabs.Tab> {
    return new Promise((resolve, reject) => {
        const query = { active: true, currentWindow: currentWindow };
        chrome.tabs.query(query, matches => {
            resolve(matches[0]);
        });
    });
}
|
||||||
|
|
||||||
|
/**
 * Promise wrapper around chrome.tabs.get.
 * @param {number} id id of the tab to look up
 * @returns {Promise<chrome.tabs.Tab>} a promise of whatever tab object chrome
 * passes to the callback
 */
export async function getTabByID(id: number): Promise<chrome.tabs.Tab> {
    return new Promise((resolve, reject) => {
        chrome.tabs.get(id, found => {
            // lastError is read on purpose and its value discarded.
            void chrome.runtime.lastError;
            resolve(found);
        });
    });
}
|
||||||
31
src/background/caches.ts
Normal file
31
src/background/caches.ts
Normal file
@ -0,0 +1,31 @@
|
|||||||
|
import { logger } from "../common/logger";
|
||||||
|
import { Actions } from "../common";
|
||||||
|
import { messageSubscribers } from "./messaging";
|
||||||
|
|
||||||
|
/**
 * Short-lived holder for a state uploaded through the UPLOAD_STATE action.
 * The state can be read once and is dropped 30 seconds after its upload.
 */
export class Caches {
    private _state: string = "";
    // Handle of the pending expiry timer, undefined when none is pending.
    private _expiry: any;

    constructor() {
        messageSubscribers.addListener(Actions.UPLOAD_STATE, (request, sender, sendResponse) => {
            sendResponse('recieved!');
            this.setState(request.fileName, request.state)
        });
    }

    /**
     * The cached state. Reading it consumes it: the cache is emptied and the
     * pending expiry is cancelled.
     */
    get state(): string {
        let s = this._state;
        this._reset();
        return s;
    }

    /**
     * Cache a state for 30 seconds.
     * @param {string} name name of the state, used for logging only
     * @param {string} content the state content
     */
    setState(name: string, content: string) {
        // Cancel the expiry of any earlier upload: otherwise its timer would
        // wipe this newer state before it has lived for its own 30 seconds.
        this._reset();
        this._state = content;
        logger.info(`State (${name}) recieved. To load it: some_var = new Extractor().load()`);
        // clear cache in 30 seconds
        this._expiry = setTimeout(() => {
            this._expiry = undefined;
            if (this._state) {
                logger.info(`Uploaded state is cleaned after 30 second.`);
                this._state = "";
            }
        }, 30000);
    }

    /** Empty the cache and cancel the pending expiry timer, if any. */
    private _reset() {
        if (this._expiry !== undefined) {
            clearTimeout(this._expiry);
            this._expiry = undefined;
        }
        this._state = "";
    }
}

export const caches = new Caches();
|
||||||
184
src/background/extractor.ts
Normal file
184
src/background/extractor.ts
Normal file
@ -0,0 +1,184 @@
|
|||||||
|
import { Task } from "./task";
|
||||||
|
import { parseUrls, saveFile } from "./tools";
|
||||||
|
import { createTab, getActiveTab, ping, redirectTab } from "./actions";
|
||||||
|
import { logger } from "../common/logger";
|
||||||
|
import { caches } from "./caches";
|
||||||
|
import { ExtractResult } from "./result";
|
||||||
|
|
||||||
|
/**
 * Holds a chain of extraction tasks and runs them one after another in a tab.
 */
export class Extractor {
    private _tasks: Task[] = [];
    private _running = false;
    private _options: any = {};

    /**
     * @param options options object handed to every Task created by this extractor
     */
    constructor(options?) {
        if (options) this._options = options;
    }

    /**
     * Ping the active tab and log an error when it does not answer.
     * @param {number} count forwarded to the `ping` action helper
     */
    static async ping(count: number = 1) {
        let tab = await getActiveTab(true) || await getActiveTab(false);
        let succ = await ping(tab, count);
        if (!succ) {
            logger.error('Cannot contact with active tab.');
            return;
        }
    }

    /**
     * Save current state, in case we restore it later.
     */
    save() {
        saveFile(JSON.stringify(this), 'application/json', 'state.json');
    }

    /**
     * Restore previous state by loading from saved state.
     * @returns this extractor, or undefined when no uploaded state is cached
     */
    load() {
        let content = caches.state;
        if (!content) {
            logger.info('No state found. Please upload a saved state from the popup window first.');
            return;
        }
        let state = JSON.parse(content);
        this._options = state._options;
        // The constructor arguments are placeholders; load() overwrites them.
        this._tasks = state._tasks.map(t => new Task(this._options, 'whaterver', ['whaterver']).load(t));
        return this;
    }

    /**
     * Add a task to Extractor. \n
     * One Extractor can have multiple tasks, organized in a task chain.
     * If url arguments are not given for later tasks, they use the previous task result as input (target url list).
     * @param {...any} args itemsSelector, fieldSelectors, and more args to specify target urls.
     */
    task(...args: any) {
        this._tasks.push(new Task(this._options, ...args));
        return this;
    }

    /**
     * Get the results of a task.
     * @param {number} id task id, begins with 0
     * @returns {ExtractResult} the results of the task, or undefined when the id is invalid
     */
    results(id?: number): ExtractResult {
        id = this._checkTaskId(id);
        if (id < 0) return;
        return this._tasks[id].results;
    }

    /**
     * Clear tasks and task caches.
     */
    clear() {
        this._tasks = [];
        return this;
    }

    /**
     * Start the task chain.
     */
    start() {
        return this._startTasks(0);
    }

    /**
     * Stop one task, or every task when no id is given.
     * @param {number} id task id, begins with 0
     */
    stop(id?: number) {
        if (id !== undefined) {
            id = this._checkTaskId(id);
            if (id < 0) return;
            this._tasks[id].stop();
            return;
        }
        for (let i = 0; i < this._tasks.length; i++) {
            this._tasks[i].stop();
        }
    }

    /**
     * Put a task into watch mode.
     * @param {number} id task id, begins with 0
     */
    watch(id: number) {
        id = this._checkTaskId(id);
        if (id < 0) return;
        this._tasks[id].watch();
    }

    /**
     * Clean the specified task and all tasks after it, then run the chain again.
     * @param {number} from where to restart the tasks, begins with 0
     */
    restart(from: number = 0) {
        let id = this._checkTaskId(from, 0);
        if (id < 0) return;
        for (let i = id; i < this._tasks.length; i++) {
            this._tasks[i].clean();
        }
        // NOTE(review): the chain is started from task 0, not from `id`, so
        // earlier (un-cleaned) tasks are executed again. Confirm whether
        // `_startTasks(id)` was intended before changing it.
        return this._startTasks(0);
    }

    /**
     * Run the tasks one after another, starting at index `from`, then offer
     * the last task's result for download. Failures are logged.
     * @param {number} from index of the first task to execute
     */
    async _startTasks(from: number) {
        if (this._running) {
            logger.info('The Extractor is running. Please wait..');
            return;
        }
        if (!this._tasks.length) {
            logger.info('No task to run.');
            return;
        }
        // Claim the flag before the first await: otherwise a second call made
        // while the tab is still being prepared would pass the guard above.
        this._running = true;
        try {
            let tab: chrome.tabs.Tab;
            let firstTask = this._tasks[0];
            if (firstTask.urls.length) {
                // task specifies target urls, create new tab with first url for it
                tab = await createTab(firstTask.urls[0], false);
            } else {
                tab = await getActiveTab(true) || await getActiveTab(false);
                let succ = await ping(tab);
                if (!succ) {
                    logger.error('Cannot contact with active tab.');
                    return;
                }
            }
            for (let i = from; i < this._tasks.length; i++) {
                let task = this._tasks[i];
                if (i > 0) {
                    // later tasks take the previous task's results as input
                    await task.execute(tab, this._tasks[i - 1].results);
                } else {
                    await task.execute(tab);
                }
            }
            this.export();
        } catch (err) {
            logger.error(err);
        } finally {
            this._running = false;
        }
    }

    /**
     * export result of a task to CSV
     * @param {number} taskid which task id to save, begins with 0; defaults to the last task
     */
    export(taskid?: number) {
        let id = this._checkTaskId(taskid, this._tasks.length - 1);
        if (id < 0) return;
        let results = this._tasks[id].results
        let count = results.data.length
        if (!count) {
            logger.info(`No result for task #${id}. Forget to call ".start()"?`);
            return;
        }
        results.header = this._tasks[id].fieldSelectors;
        // The preview shows at most 50 rows.
        let msg = `
Please confirm to download (${count} items):

${results.toString(50) || "- Empty -"}
`.trim();
        if (confirm(msg)) {
            saveFile(results.toString(), "text/csv");
        }
    }

    /**
     * Validate a task id.
     * @param {number} id the id to check
     * @param {number} defaultId used when `id` is undefined
     * @returns {number} the valid id, or -1 (after logging) when there is no task or the id is out of range
     */
    private _checkTaskId(id: number, defaultId?: number) {
        if (!this._tasks.length) {
            logger.info("No task found.");
            return -1;
        }
        if (!isNaN(defaultId) && id === undefined) id = defaultId;
        if (isNaN(id) || id < 0 || id >= this._tasks.length) {
            logger.info(`Invalid task id. Rang(0-${this._tasks.length - 1})`);
            return -1;
        }
        return id
    }
}
|
||||||
14
src/background/index.ts
Normal file
14
src/background/index.ts
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
import { Extractor } from "./extractor";
|
||||||
|
|
||||||
|
declare global {
    interface Window {
        $: (...args: any) => void;
        Extractor: any;
    }
}

// Expose the Extractor class on the window.
window.Extractor = Extractor;

// Shortcut: build a one-task Extractor from the arguments and start it.
window.$ = (...args) => new Extractor().task(...args).start();
|
||||||
150
src/background/messaging.ts
Normal file
150
src/background/messaging.ts
Normal file
@ -0,0 +1,150 @@
|
|||||||
|
import { Request, Actions, Response } from "../common";
|
||||||
|
import { getTabByID } from "./actions";
|
||||||
|
import { logger } from "../common/logger";
|
||||||
|
|
||||||
|
|
||||||
|
export type ResponseCheckerSync<T> = (r: Response<T>, err: chrome.runtime.LastError, count: number) => T;
|
||||||
|
export type ResponseCheckerAsync<T> = (r: Response<T>, err: chrome.runtime.LastError, count: number) => Promise<T>;
|
||||||
|
export type ResponseChecker<T> = ResponseCheckerSync<T> | ResponseCheckerAsync<T>;
|
||||||
|
/**
 * Send a message to the target tab repeatedly until a defined result is obtained.
 * @param {object} tab the tab where to send the message
 * @param {object} req the request data.
 * @param {string} log message logged to console after each attempt.
 * @param {function} dataChecker (response, err:error, tryCount:number) => any.
 * Checks the response and decides what value is finally returned.
 * Return undefined to make 'sendMessage' retry.
 * Throw (or reject) to make 'sendMessage' reject with that error.
 * @param {number} timeout NOTE(review): accepted but currently not applied;
 * every request uses a fixed 10 second timeout. Callers disagree on the unit
 * (the default is 10, `ping` passes 1000), so confirm it before wiring it in.
 * @param {number} interval retry interval, default: 500ms.
 * @param {number} limit retry limit, default: 0, no limit.
 * @return {Promise} a promise of the response.
 */
export function sendMessage<T>(
    tab: chrome.tabs.Tab,
    req: Request,
    log?: string,
    dataChecker?: ResponseChecker<T>,
    timeout?: number,
    interval?: number,
    limit?: number
) {
    // Timeout actually applied to each request, in milliseconds.
    const REQUEST_TIMEOUT_MS = 10000;
    timeout = timeout || 10;
    interval = interval || 500;
    limit = isNaN(limit) ? 0 : limit;
    let count = 0;
    return new Promise<T>((resolve, reject) => {
        // Set once the promise is resolved or rejected, so that a late
        // response cannot start another retry afterwards.
        let settled = false;
        const succeed = (value: T) => { settled = true; resolve(value); };
        const fail = (reason: any) => { settled = true; reject(reason); };
        // An exception inside loop() must reject the promise instead of
        // leaving it pending forever.
        const attempt = () => { loop().catch(fail); };

        attempt();

        async function loop() {
            logger.debug("Request for", Actions[req.action]);
            let tabAvailable = await getTabByID(tab.id);
            if (!tabAvailable) {
                fail("Task interrupted due to the target tab is closed.");
                return;
            }

            if (limit && count >= limit) {
                fail(`sendMessage loop limit ${limit} reached.`);
                return;
            }
            count++;
            // Named `timer` so that it no longer shadows the `timeout` parameter.
            let timer = setTimeout(() => {
                fail(`${Actions[req.action]} request timeout after ${REQUEST_TIMEOUT_MS / 1000}s`);
            }, REQUEST_TIMEOUT_MS);
            chrome.tabs.sendMessage(tab.id, req, async (r: Response<T>) => {
                clearTimeout(timer);
                // check error but do nothing until dataChecker.
                let err = chrome.runtime.lastError;
                // The request already timed out: ignore the late response.
                if (settled) return;
                let [result, error] = await checkResponse(dataChecker, r, err, count);
                if (error) {
                    fail(error);
                    return;
                }
                let flag = result !== undefined;
                if (log) logger.info(log, flag ? '(OK)' : '(failed)');
                if (flag) {
                    succeed(result);
                } else {
                    setTimeout(() => {
                        logger.debug('Invalid response', r, 'retry...');
                        attempt();
                    }, interval);
                }
            });
        }
    });
}
|
||||||
|
|
||||||
|
/**
 * Run the optional checker against a response and normalise the outcome to a
 * `[result, error]` pair.
 * @param dataChecker optional checker, synchronous or asynchronous; without
 * one the response's own `result` and `error` are returned
 * @param response the response received; may be undefined
 * @param error the chrome.runtime.lastError captured for the request
 * @param tryCount number of attempts made so far
 * @returns `[result, undefined]` on success, `[undefined, error]` when the
 * checker throws or rejects, `[undefined, undefined]` when there is no response
 */
async function checkResponse<T>(
    dataChecker: ResponseChecker<T>,
    response: Response<T>,
    error: chrome.runtime.LastError,
    tryCount: number
): Promise<[T, string]> {
    // response could be undefined if the content script is interrupted.
    // don't check, tell sendMessage to retry.
    if (!response) return [undefined, undefined];
    if (!dataChecker) {
        return [response.result, response.error];
    }
    try {
        // `await` covers both checker kinds. Whatever a checker throws or
        // rejects with, even a falsy value, is reported as the error and
        // never as the result.
        const result = await dataChecker(response, error, tryCount);
        return [result, undefined];
    } catch (err) {
        return [undefined, err];
    }
}
|
||||||
|
|
||||||
|
export type ActionSubscriberSync = (request: Request, sender: chrome.runtime.MessageSender, sendResponse: (response?: any) => void) => void;
|
||||||
|
export type ActionSubscriberAsync = (request: Request, sender: chrome.runtime.MessageSender, sendResponse: (response?: any) => void) => Promise<void>;
|
||||||
|
export type ActionSubscriber = ActionSubscriberSync | ActionSubscriberAsync;
|
||||||
|
|
||||||
|
/**
 * Registry of subscribers, grouped by the action they listen to.
 */
class MessageSubscribers {
    private listeners: { [key: number]: ActionSubscriber[] } = {};

    /** Register `subscriber` for `action`. */
    addListener(action: Actions, subscriber: ActionSubscriber) {
        if (!this.listeners[action]) this.listeners[action] = [];
        this.listeners[action].push(subscriber);
    }

    /** Remove every registration of `subscriber` for `action`, then log how many subscribers remain. */
    removeListener(action: Actions, subscriber: ActionSubscriber) {
        if (!this.listeners[action]) this.listeners[action] = [];
        const registered = this.listeners[action];
        // Walk backwards so that removing an entry never shifts the ones
        // still to be visited; the array is edited in place.
        for (let i = registered.length - 1; i >= 0; i--) {
            if (registered[i] == subscriber) registered.splice(i, 1);
        }
        logger.debug(`${this.listeners[action].length} subscriber(s) remained for action ${Actions[action]}`);
    }

    /** Get the subscribers of `action`; undefined when the action was never used. */
    getListeners(action: Actions): ActionSubscriber[] {
        return this.listeners[action]
    }
}

export const messageSubscribers = new MessageSubscribers();
|
||||||
|
|
||||||
|
// Dispatch every incoming runtime message to the subscribers registered for
// its action; answer "Request not supported." when there is none.
chrome.runtime.onMessage.addListener((request: Request, sender, sendResponse) => {
    const subscribers = messageSubscribers.getListeners(request.action);
    if (!(subscribers && subscribers.length)) {
        sendResponse("Request not supported.");
        return;
    }
    // Promises returned by asynchronous subscribers.
    const pending: Promise<any>[] = [];
    for (const notify of subscribers) {
        const outcome = notify(request, sender, sendResponse);
        if (outcome instanceof Promise) pending.push(outcome);
    }
    return pending.length ? Promise.all(pending) : undefined;
});
|
||||||
85
src/background/result.ts
Normal file
85
src/background/result.ts
Normal file
@ -0,0 +1,85 @@
|
|||||||
|
import { logger } from "../common/logger";
|
||||||
|
import { getActiveTab, ping, redirectTab } from "./actions";
|
||||||
|
import { parseUrls } from "./tools";
|
||||||
|
|
||||||
|
/**
 * Extracted data: a table of string cells with an optional header row.
 */
export class ExtractResult {
    private _header: string[];
    private _data: string[][] = [];

    /**
     * @param {string[][]} data the rows; a falsy value is treated as no rows
     */
    constructor(data: string[][]) {
        this._data = data || [];
    }

    /** Get the row at `index`. */
    row(index: number): string[] {
        return this._data[index];
    }

    /** Get the cells at position `index` of every row. */
    column(index: number): string[] {
        return this._data.map(cells => cells[index]);
    }

    /** Concatenate all rows into one flat list of cells. */
    squash(): string[] {
        return this._data.reduce((p, c) => p.concat(c), []);
    }

    /** Header row written in front of the data by toString(). */
    set header(h: string[]) {
        this._header = h
    }

    /** The data rows (the header is not included). */
    get data(): string[][] {
        return this._data;
    }

    /**
     * Render the result as CSV: every cell is trimmed and double-quoted, with
     * embedded quotes doubled, and every line ends with "\n".
     * @param {number} rowsCount maximum number of data rows to render; 0 (default) renders all
     * @returns {string} the CSV text, starting with the header line when a header is set
     */
    toString(rowsCount: number = 0): string {
        const body = rowsCount > 0 ? this._data.slice(0, rowsCount) : this._data;
        // Build a new list instead of unshifting into `body`: without a row
        // limit `body` IS the internal data, and inserting the header there
        // would duplicate it on every call and corrupt data/row/column.
        const lines = (this._header && this._header.length)
            ? [this._header].concat(body)
            : body;
        return lines.reduce(
            (csv, lineCells) => {
                if (!lineCells || !lineCells.length) {
                    return csv + "\n";
                }
                const line = lineCells
                    .map(cell => '"' + (cell || "").trim().replace(/"/g, '""') + '"')
                    .join(",");
                return csv + line + "\n";
            },
            ""
        );
    }

    /**
     * Call `fn` for every cell, row by row, each call waiting for the
     * previous one to settle. A failure stops the walk and is logged.
     * @param fn callback receiving the row index, column index and cell value
     */
    async walk(fn: (row: number, col: number, value: string) => void) {
        let pms = Promise.resolve(null);
        for (let i = 0; i < this._data.length; i++) {
            let cells = this._data[i];
            for (let j = 0; j < cells.length; j++) {
                let row = i;
                let col = j;
                let value = cells[j];
                pms = pms.then(
                    () => fn(row, col, value)
                )
            }
        }
        return pms.catch(err => {
            logger.error(err);
        });
    }

    /**
     * Redirect the active tab, one after another, to every URL that
     * parseUrls derives from this result. Failures are logged.
     */
    async visit() {
        let urls = parseUrls(this);
        let tab = await getActiveTab(true) || await getActiveTab(false);
        let succ = await ping(tab);
        if (!succ) {
            logger.error('Cannot contact with active tab.');
            return;
        }
        return urls.reduce(
            (pms, url: string, i: number) => {
                return pms.then(
                    async () => {
                        return redirectTab(tab, url, false);
                    });
            }, Promise.resolve<void>(undefined)
        ).catch(err => {
            logger.error(err);
        });
    }
}
|
||||||
@ -1,4 +1,6 @@
|
|||||||
const signitures = `
|
import { ExtractResult } from "./result";
|
||||||
|
|
||||||
|
export const signitures = `
|
||||||
## Usage
|
## Usage
|
||||||
// single task
|
// single task
|
||||||
$(...args);
|
$(...args);
|
||||||
@ -18,12 +20,13 @@ function(itemsSelector:string, fieldSelectors:string[], urls:string[]);
|
|||||||
$(".item", ["a", "a@href"]);
|
$(".item", ["a", "a@href"]);
|
||||||
|
|
||||||
## See Detailed Help:
|
## See Detailed Help:
|
||||||
https://git.jebbs.co/jebbs/data-extracter-extesion
|
https://git.qjebbs.com/jebbs/data-extracter-extesion
|
||||||
`.trim();
|
`.trim();
|
||||||
|
|
||||||
function testArgs(...args) {
|
export function testArgs(...args: any) {
|
||||||
switch (args.length) {
|
switch (args.length) {
|
||||||
case 0, 1:
|
case 0:
|
||||||
|
case 1:
|
||||||
return false;
|
return false;
|
||||||
case 2:
|
case 2:
|
||||||
return args[0] && args[1] &&
|
return args[0] && args[1] &&
|
||||||
@ -66,7 +69,3 @@ function testArgs(...args) {
|
|||||||
return arr.reduce((p, c) => p && tester(c), true);
|
return arr.reduce((p, c) => p && tester(c), true);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
function argsToString(...args) {
|
|
||||||
return args.map(v => (v instanceof Array ? `[${v.join(', ')}]` : v.toString())).join(', ');
|
|
||||||
}
|
|
||||||
178
src/background/task.ts
Normal file
178
src/background/task.ts
Normal file
@ -0,0 +1,178 @@
|
|||||||
|
import { parseUrls } from "./tools";
|
||||||
|
import { queryUrl, redirectTab, scrollToBottom, extractTabData, findIncognitoWindow, getCurrentWindow, getWindowByID } from "./actions";
|
||||||
|
import { testArgs, signitures } from "./signiture";
|
||||||
|
import { ExtractResult } from "./result";
|
||||||
|
import { messageSubscribers, ActionSubscriber } from "./messaging";
|
||||||
|
import { Actions } from "../common";
|
||||||
|
import { logger } from "../common/logger";
|
||||||
|
|
||||||
|
export class Task {
|
||||||
|
private _data: { [key: string]: string[][] } = {};
|
||||||
|
private _data_keys: string[] = [];
|
||||||
|
private _options: any;
|
||||||
|
private _itemsSelector: string;
|
||||||
|
private _fieldSelectors: string[];
|
||||||
|
private _urls: string[] = [];
|
||||||
|
private _running = false;
|
||||||
|
private _listeners: ActionSubscriber[] = [];
|
||||||
|
|
||||||
|
constructor(options: any, ...arg: any);
|
||||||
|
constructor(options: any, itemsSelector: string, fieldSelectors: string[]);
|
||||||
|
constructor(options: any, itemsSelector: string, fieldSelectors: string[], url: string, from: number, to: number, interval: number);
|
||||||
|
constructor(options: any, itemsSelector: string, fieldSelectors: string[], url: string, pages: number[]);
|
||||||
|
constructor(options: any, itemsSelector: string, fieldSelectors: string[], urls: string[]);
|
||||||
|
constructor(options, ...args) {
|
||||||
|
if (!testArgs(...args))
|
||||||
|
throw new Error(`Invalid call arguments.\n\n${signitures}\n\n`);
|
||||||
|
this._options = options;
|
||||||
|
this._itemsSelector = args.shift();
|
||||||
|
this._fieldSelectors = args.shift();
|
||||||
|
this._urls = parseUrls(...args);
|
||||||
|
}
|
||||||
|
load(state: any): Task {
|
||||||
|
this._itemsSelector = state._itemsSelector;
|
||||||
|
this._data = state._data;
|
||||||
|
this._data_keys = state._data_keys;
|
||||||
|
this._itemsSelector = state._itemsSelector;
|
||||||
|
this._fieldSelectors = state._fieldSelectors;
|
||||||
|
this._urls = state._urls;
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
get urls(): string[] {
|
||||||
|
return this._urls;
|
||||||
|
}
|
||||||
|
get results(): ExtractResult {
|
||||||
|
let rs: string[][] = this._data_keys.reduce((p, c) => {
|
||||||
|
return p.concat(this._data[c]);
|
||||||
|
}, []);
|
||||||
|
return new ExtractResult(rs);
|
||||||
|
}
|
||||||
|
get fieldSelectors(): string[] {
|
||||||
|
return this._fieldSelectors;
|
||||||
|
}
|
||||||
|
clean(): Task {
|
||||||
|
this.stop();
|
||||||
|
this._data = {};
|
||||||
|
this._data_keys = [];
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
stop() {
|
||||||
|
this._running = false;
|
||||||
|
let listener: ActionSubscriber;
|
||||||
|
while (listener = this._listeners.pop()) {
|
||||||
|
messageSubscribers.removeListener(Actions.REPORT_NEW_PAGE, listener);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/**
 * Starts watching a window: every time a page in that window reports
 * itself (REPORT_NEW_PAGE), data is extracted from the reporting tab and
 * saved under the tab's URL.
 *
 * The incognito window is preferred; the current window is the fallback.
 * Does nothing (besides logging) when the task is already running or no
 * window is available.
 */
async watch() {
    if (this._running) {
        logger.info("The task is running. Please wait...");
        return;
    }
    this._running = true;

    const window = await findIncognitoWindow() || await getCurrentWindow();
    if (!window) {
        logger.info("No window to watch...");
        // Nothing was started, so do not leave the task flagged as running.
        this._running = false;
        return;
    }

    let watchTaskID = 0;
    const listener: ActionSubscriber = async (request, sender, sendResponse) => {
        const findWindow = await getWindowByID(window.id);
        if (!findWindow) {
            // stop watch on window close: stop() unsubscribes the listener
            // and clears the running flag so the task can be reused.
            this.stop();
            return;
        }
        // only watch current window.
        if (!sender.tab || sender.tab.windowId != window.id) return;

        const taskID = watchTaskID++;
        logger.info(`Watcher #${taskID} starts.`);
        try {
            // Awaiting also copes with makeOptionalTasks() returning nothing.
            await this.makeOptionalTasks(sender.tab);
            const results = await extractTabData(
                sender.tab, this._itemsSelector, this._fieldSelectors, sender.tab.url, true
            );
            if (results && results.length) {
                this.saveResult(results, sender.tab.url);
            }
            logger.info(`Watcher #${taskID} ends.`);
        } catch (e) {
            logger.error(`Watcher #${taskID} ends with:`, e);
        }
    };

    this._listeners.push(listener);
    messageSubscribers.addListener(Actions.REPORT_NEW_PAGE, listener);
}
|
||||||
|
/**
 * Runs the task in `tab`: visits every target URL in order, extracts the
 * rows from each page and saves them under that URL.
 *
 * Target URLs are, in order of preference: the URLs given at construction,
 * the URLs parsed from `upstreamData`, or the tab's current URL.
 * URLs that already have saved data are skipped, and only non-empty
 * results are saved. The running flag is always cleared when the run ends,
 * whether it succeeds or fails.
 *
 * @param tab the tab to drive.
 * @param upstreamData optional result of a previous task to take URLs from.
 * @throws a string when there is no tab, when the task is already running,
 *         or when the task is stopped mid-run; any error from the
 *         underlying tab actions is propagated as-is.
 */
async execute(tab: chrome.tabs.Tab, upstreamData?: ExtractResult): Promise<void> {
    if (!tab) throw "No tab to execute the task.";
    if (this._running) throw "The task is running. Please wait...";
    this._running = true;

    try {
        let urls = this._urls;
        if (!urls.length) {
            // A failing queryUrl() now propagates instead of silently
            // producing a single undefined URL.
            urls = upstreamData ? parseUrls(upstreamData) : [await queryUrl(tab)];
        }

        for (const url of urls) {
            // Already extracted in an earlier run.
            if (this._data[url]) continue;

            await this.runningCheck(() => redirectTab(tab, url));
            await this.makeOptionalTasks(tab);
            const results: string[][] = await this.runningCheck(
                () => extractTabData(tab, this._itemsSelector, this._fieldSelectors)
            );
            if (results && results.length) {
                this.saveResult(results, url);
            }
        }
    } finally {
        this._running = false;
    }
}
|
||||||
|
/**
 * Runs the optional per-page steps enabled in the task options.
 * Currently the only step is `scrollToBottom`.
 *
 * @param tab the tab to run the steps in.
 * @returns a promise that settles when the enabled steps are done; an
 *          already-resolved promise when no step is enabled, so callers
 *          can always chain on the result.
 */
private makeOptionalTasks(tab: chrome.tabs.Tab): Promise<any> {
    if (this._options && this._options["scrollToBottom"]) {
        return this.runningCheck(() => scrollToBottom(tab));
    }
    return Promise.resolve();
}
|
||||||
|
/**
 * Invokes `fn` only while the task is flagged as running.
 *
 * @param fn the asynchronous step to run.
 * @returns whatever `fn` returns.
 * @throws a string (synchronously) when the task is no longer running.
 */
private runningCheck(fn: () => Promise<any>): Promise<any> {
    if (this._running) {
        return fn();
    }
    throw "The task is stopped by user.";
}
|
||||||
|
/**
 * Stores `results` under `key`, replacing any earlier value, and logs the
 * number of items.
 *
 * @param results the extracted rows.
 * @param key the storage key (callers in this class pass the page URL).
 */
private saveResult(results, key) {
    const isFirstSave = this._data[key] === undefined;
    if (isFirstSave) {
        // Register the key once so the insertion order is kept without duplicates.
        this._data_keys.push(key);
    }
    this._data[key] = results;
    logger.info(`${results.length} items found.`)
}
|
||||||
|
}
|
||||||
62
src/background/tools.ts
Normal file
62
src/background/tools.ts
Normal file
@ -0,0 +1,62 @@
|
|||||||
|
import { ExtractResult } from "./result";
|
||||||
|
|
||||||
|
const URL_REG = /^\s*(https?):\/\//im;
|
||||||
|
|
||||||
|
/**
 * Turns the URL-related arguments of a task into a list of URLs.
 *
 * Accepted forms:
 *  - no arguments: returns `[]`;
 *  - `(urls: string[])`: returned as-is;
 *  - `(result: ExtractResult)`: the squashed values that look like
 *    http(s) URLs;
 *  - `(template, pages[])`: every `${page}` in the template replaced by
 *    each page value;
 *  - `(template, from, to, interval)`: the same, for `from`, `from + interval`,
 *    ... up to and including `to`.
 * Anything else yields `[]`.
 *
 * @throws {Error} when a range is requested with a non-positive interval
 *         and `from <= to`, which could never terminate.
 */
export function parseUrls(...args): string[] {
    if (!args.length) return [];

    const [first, ...rest] = args;
    if (first instanceof Array) {
        return first;
    }
    if (first instanceof ExtractResult) {
        return first.squash().filter(v => URL_REG.test(v));
    }

    const urlTempl = first;
    if (!urlTempl) return [];

    // split/join replaces every placeholder and never interprets "$" sequences.
    const fill = (page): string => urlTempl.split("${page}").join(String(page));

    if (rest[0] instanceof Array) {
        return rest[0].map(fill);
    }
    if (rest.length >= 3) {
        const [from, to, interval] = rest;
        if (from <= to && interval <= 0) {
            throw new Error(`Invalid page interval ${interval}: it must be greater than 0.`);
        }
        const urls: string[] = [];
        for (let page = from; page <= to; page += interval) {
            urls.push(fill(page));
        }
        return urls;
    }
    return [];
}
|
||||||
|
|
||||||
|
/**
 * Offers `data` to the user as a file download.
 *
 * A Blob is built from the data (via `Blob`, or `MSBlobBuilder` where
 * `window.Blob` is not a function) and an object URL is created for it.
 * The download is then triggered by, in order of preference: a hidden
 * `<a download>` element that receives a synthetic click,
 * `navigator.msSaveBlob`, or navigating to the object URL.
 *
 * @param data the file content.
 * @param mimeType the MIME type of the Blob.
 * @param fileName the download name; defaults to the document title, then "result".
 */
export function saveFile(data: string, mimeType: string, fileName?: string) {
    fileName = fileName || document.title || "result";

    let blob: Blob;
    if (typeof window.Blob == "function") {
        blob = new Blob([data], { type: mimeType });
    } else {
        const legacyBuilder = new window.MSBlobBuilder();
        legacyBuilder.append(data);
        blob = legacyBuilder.getBlob(mimeType);
    }

    const urlApi = window.URL || window.webkitURL;
    const objectUrl = urlApi.createObjectURL(blob);
    const anchor = document.createElement("a");

    if ('download' in anchor) {
        anchor.style.visibility = "hidden";
        anchor.href = objectUrl;
        anchor.download = fileName;
        document.body.appendChild(anchor);
        const click = document.createEvent("MouseEvents");
        click.initEvent("click", true, true);
        anchor.dispatchEvent(click);
        document.body.removeChild(anchor);
        return;
    }
    if (navigator.msSaveBlob) {
        navigator.msSaveBlob(blob, fileName);
        return;
    }
    location.href = objectUrl;
}
|
||||||
28
src/common/index.ts
Normal file
28
src/common/index.ts
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
/** Identifiers of the messages exchanged between the extension's scripts. */
export enum Actions {
    // from background to content script
    EXTRACT = 1,
    GOTO_URL = 2,
    PING = 3,
    QUERY_URL = 4,
    SCROLL_BOTTOM = 5,
    SLEEP = 6,
    WAKEUP = 7,

    // from popup to background script
    UPLOAD_STATE = 8,

    // from content to background script
    REPORT_NEW_PAGE = 9,
}
|
||||||
|
|
||||||
|
/** A message sent between scripts; only `action` is mandatory. */
export interface Request {
    /** What the receiver is asked to do. */
    action: Actions;
    /** Selector of the items to extract. */
    itemsSelector?: string;
    /** Selectors of the fields inside each item. */
    fieldSelectors?: string[];
    /** A URL that accompanies the action. */
    url?: string;
    /** Name of an uploaded state file. */
    fileName?: string;
    /** Text content of an uploaded state file. */
    state?: string;
}
|
||||||
|
|
||||||
|
/** The reply to a `Request`: a result plus an error message. */
export interface Response<T> {
    /** The value produced by the action. */
    result: T;
    /** The error message of the action. */
    error: string;
}
|
||||||
75
src/common/logger.ts
Normal file
75
src/common/logger.ts
Normal file
@ -0,0 +1,75 @@
|
|||||||
|
/** Severity levels in ascending order; `DISABLED` is above every real level. */
export enum LOGGER_LEVEL {
    DEBUG = 1,
    INFO = 2,
    WARN = 3,
    ERROR = 4,
    DISABLED = 5,
}
|
||||||
|
|
||||||
|
/**
 * Console logger that can additionally surface messages as a Chrome
 * notification.
 *
 * A message is written to the console when its level is at least
 * `logLevel`, and also shown as a notification when its level is at least
 * `notifyLevel`. A single notification is reused: it is created on the
 * first notified message and updated afterwards.
 */
export class Logger {
    // Id of the notification currently in use; undefined when there is none.
    private _notificationId = undefined;
    private _log_level = LOGGER_LEVEL.INFO;
    private _notify_level = LOGGER_LEVEL.ERROR;

    /**
     * @param logLevel minimum level written to the console (default INFO).
     * @param notifyLevel minimum level shown as a notification (default ERROR).
     */
    constructor(logLevel, notifyLevel) {
        if (logLevel) this._log_level = logLevel;
        if (notifyLevel) this._notify_level = notifyLevel;
        if (chrome.notifications) {
            chrome.notifications.onClosed.addListener((id, byUser) => {
                // Forget a closed notification so that the next message
                // creates a fresh one instead of updating a dead id.
                if (id === this._notificationId) this._notificationId = undefined;
            });
        }
    }

    get logLevel() {
        return this._log_level;
    }

    set logLevel(val: LOGGER_LEVEL) {
        this._log_level = val;
    }

    get notifyLevel() {
        return this._notify_level;
    }

    set notifyLevel(val: LOGGER_LEVEL) {
        this._notify_level = val;
    }

    /**
     * Writes `msgs` through `loggerFn`, prefixed with the local time and the
     * level name, then notifies when the level qualifies.
     */
    log(level: LOGGER_LEVEL, loggerFn: Function, ...msgs) {
        if (level < this._log_level) return;
        const time = new Date().toLocaleString();
        loggerFn(`${time} [${LOGGER_LEVEL[level]}]`, ...msgs);
        if (level < this._notify_level) return;
        this.notify(...msgs);
    }

    debug(...msgs) {
        this.log(LOGGER_LEVEL.DEBUG, console.debug, ...msgs);
    }

    info(...msgs) {
        this.log(LOGGER_LEVEL.INFO, console.info, ...msgs);
    }

    warn(...msgs) {
        this.log(LOGGER_LEVEL.WARN, console.warn, ...msgs);
    }

    error(...msgs) {
        this.log(LOGGER_LEVEL.ERROR, console.error, ...msgs);
    }

    /**
     * Shows `msgs` (joined with spaces) in the logger's notification,
     * creating it when there is none yet. Does nothing where the
     * notifications API is not available.
     */
    notify(...msgs) {
        if (!chrome.notifications) return;
        const msg = msgs.join(' ');
        if (!this._notificationId) {
            chrome.notifications.create(
                null,
                {
                    "type": "basic",
                    "iconUrl": chrome.extension.getURL('icon.png'),
                    "title": "Data Extractor",
                    "message": msg,
                    "priority": 0,
                    "requireInteraction": true,
                },
                notificationId => {
                    this._notificationId = notificationId;
                }
            );
            return;
        }
        chrome.notifications.update(
            this._notificationId,
            { "message": msg }
        );
    }
}
|
||||||
|
|
||||||
|
export const logger = new Logger(LOGGER_LEVEL.DEBUG, LOGGER_LEVEL.DISABLED);
|
||||||
101
src/content/actions.ts
Normal file
101
src/content/actions.ts
Normal file
@ -0,0 +1,101 @@
|
|||||||
|
import { logger } from "../common/logger";
|
||||||
|
|
||||||
|
/**
 * Extracts one row per element matching `itemsSelector`, with one cell per
 * entry of `fieldSelectors`.
 *
 * Field selector syntax: `[!]<css selector>[@<property>]`.
 *  - a leading `!` dispatches a click on every matched element;
 *  - an empty css selector means the item element itself;
 *  - `@property` reads that property of the element instead of its trimmed
 *    text content.
 * Values of several matches for one field are joined with a newline; a
 * field with no match in a row yields `undefined` for that cell.
 *
 * Because elements may be loaded asynchronously, an empty array is
 * returned (so the sender can retry) when no item is found, or when some
 * field is not found in any row. Note that a wrong selector therefore
 * makes the sender retry forever.
 *
 * @param itemsSelector selector of the row elements.
 * @param fieldSelectors selectors of the cells, relative to each row.
 * @param expectedURL when set, the page URL must equal it.
 * @throws a string when `expectedURL` is set and differs from `location.href`.
 */
export function extract(itemsSelector: string, fieldSelectors: string[], expectedURL: string): string[][] {
    if (expectedURL && location.href != expectedURL) {
        throw 'Target tab URL changed, aborting...';
    }

    const items: Element[] = Array.from(document.querySelectorAll(itemsSelector));
    // items may not loaded yet, tell the sender to retry.
    if (!items.length) return [];

    // Keyed by the selector exactly as given by the caller (including any
    // "!" prefix), because that is how it is looked up further down.
    const fieldFound: { [key: string]: boolean } = {};

    const results: string[][] = items.map(item => fieldSelectors.map(rawSelector => {
        const doClick = rawSelector.startsWith("!");
        const selector = doClick ? rawSelector.substring(1) : rawSelector;

        let [cls, attr] = selector.split('@').slice(0, 2);
        cls = cls.trim();
        const fieldElements: Element[] = cls != ""
            ? Array.from(item.querySelectorAll(cls))
            : [item];
        if (!fieldElements.length) {
            return undefined;
        }
        fieldFound[rawSelector] = true;

        return fieldElements.map(find => {
            if (doClick) {
                const e = document.createEvent("MouseEvents");
                e.initEvent("click", true, true);
                find.dispatchEvent(e);
            }
            return attr ? find[attr] : find.textContent.trim();
        }).join('\n');
    }));

    // TODO: configurable wait logic
    // if it exists a field, which is not found in any row, the sender should retry.
    const notFoundFields = fieldSelectors.filter(f => !fieldFound[f]);
    if (notFoundFields.length > 0) {
        logger.debug('should wait for:', notFoundFields.join(','));
        return [];
    }
    return results;
}
|
||||||
|
|
||||||
|
/**
 * Scrolls the window towards the bottom of the page via `executeUntil`,
 * re-checking every 1000 ms with a limit argument of 10, until less than
 * 20 pixels remain below the viewport.
 *
 * @returns the promise produced by `executeUntil`.
 */
export function scrollToBottom() {
    const scrollDown = () => window.scrollTo(0, document.body.clientHeight);
    const reachedBottom = () => {
        const remaining = document.body.clientHeight - window.scrollY - window.innerHeight;
        return remaining < 20;
    };
    return executeUntil(scrollDown, reachedBottom, "Scroll to page bottom...", 1000, 10);
}
|
||||||
|
|
||||||
|
/**
 * Repeatedly executes a function until the detector returns true.
 *
 * Each round calls `fn`, waits `interval` milliseconds and then asks the
 * detector; the promise resolves with `true` as soon as the detector is
 * satisfied (or immediately after the first wait when no detector is given).
 *
 * @param {object} fn the function to execute
 * @param {object} detector the detector.
 * @param {string} log message logged to the console after every check.
 * @param {number} interval milliseconds between executing and detecting (default 500)
 * @param {number} limit max execute times of the function; falsy means no limit
 * @return {Promise} resolves with true on success; rejects with an Error
 *         once `fn` has been executed `limit` times without success, or
 *         with whatever `fn` / `detector` throws.
 */
function executeUntil(fn: () => void, detector: () => boolean, log: string, interval: number, limit: number) {
    interval = interval || 500;
    let count = 0;

    return new Promise<boolean>((resolve, reject) => {
        const attempt = () => {
            if (limit && count >= limit) {
                reject(new Error(`Gave up after ${count} attempts.`));
                return;
            }
            try {
                fn();
            } catch (e) {
                reject(e);
                return;
            }
            count++;

            setTimeout(() => {
                let flag: boolean;
                try {
                    flag = !detector || detector();
                } catch (e) {
                    // Errors thrown inside a timer would otherwise be lost.
                    reject(e);
                    return;
                }
                if (log) console.log(log, flag ? '(OK)' : '(failed)');
                if (flag) {
                    resolve(true);
                } else {
                    attempt();
                }
            }, interval);
        };
        attempt();
    });
}
|
||||||
75
src/content/index.ts
Normal file
75
src/content/index.ts
Normal file
@ -0,0 +1,75 @@
|
|||||||
|
import { Request, Actions, Response } from '../common';
|
||||||
|
import { scrollToBottom, extract } from './actions';
|
||||||
|
|
||||||
|
// While true, every request except WAKEUP is answered with undefined.
let asleep = false;

chrome.runtime.onMessage.addListener(
    (request, sender: chrome.runtime.MessageSender, sendResponse: (r: any) => void) => {
        if (!request.action) return;

        const isWakeup = Actions.WAKEUP == request.action;
        if (asleep && !isWakeup) {
            if (sendResponse) sendResponse(undefined);
            return;
        }

        doAction(request, sender).then(r => {
            if (sendResponse) sendResponse(r);
        });
        // return true to indicate you wish to send a response asynchronously
        return true;
    }
);

// Report this page to the background script as soon as the content script runs.
chrome.runtime.sendMessage(<Request>{ action: Actions.REPORT_NEW_PAGE });
|
||||||
|
|
||||||
|
/**
 * Performs the action named by `request.action` and wraps the outcome in a
 * `Response`.
 *
 * Never rejects: anything thrown while performing the action is reported
 * in the response's `error` field (an Error's message, otherwise the
 * thrown value converted to a string). An unknown action yields the error
 * 'Unsupported action.'.
 *
 * @param request the request to handle.
 * @param sender the sender of the request (not used by the current actions).
 */
async function doAction(request: Request, sender: chrome.runtime.MessageSender): Promise<Response<any>> {
    let result: any;
    let error: string;

    try {
        switch (request.action) {
            case Actions.EXTRACT:
                result = extract(request.itemsSelector, request.fieldSelectors, request.url);
                break;
            case Actions.GOTO_URL:
                window.location.replace(request.url);
                // should not receive any request until the page & script reload
                asleep = true;
                result = request.url;
                break;
            case Actions.PING:
                result = "pong";
                break;
            case Actions.QUERY_URL:
                result = window.location.href;
                break;
            case Actions.SCROLL_BOTTOM:
                // Wait for the scrolling to finish, so that the response
                // carries its outcome rather than a pending promise.
                result = await scrollToBottom();
                break;
            case Actions.SLEEP:
                asleep = true;
                result = "Content script is sleeping.";
                break;
            case Actions.WAKEUP:
                asleep = false;
                result = "Content script is available.";
                break;
            default:
                error = 'Unsupported action.'
                break;
        }
    } catch (err) {
        error = err instanceof Error ? err.message : String(err);
    }

    return newResponse(result, error);
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Builds a `Response` from a result and an optional error message.
 *
 * @param result the value to report.
 * @param err the error message, if any.
 */
function newResponse<T>(result: T, err?: string): Response<T> {
    return { result, error: err };
}
|
||||||
34
src/popup/index.ts
Normal file
34
src/popup/index.ts
Normal file
@ -0,0 +1,34 @@
|
|||||||
|
import { Request, Actions } from '../common';
|
||||||
|
|
||||||
|
// Wires up the popup once it has loaded: two links that open a new tab, and
// the file input whose content is sent to the background script.
window.onload = function () {
    // Opens a tab for the URL produced by `makeUrl` when `selector` is clicked.
    const openTabOnClick = (selector: string, makeUrl: () => string) => {
        document.querySelector(selector)
            .addEventListener('click', () => {
                chrome.tabs.create({ 'url': makeUrl() });
            });
    };

    openTabOnClick('#link-extension-detail', () => `chrome://extensions/?id=${chrome.runtime.id}`);
    openTabOnClick('#link-document', () => `https://git.qjebbs.com/jebbs/data-extracter-extesion`);

    document.querySelector('#state-input')
        .addEventListener('change', function (...args) {
            // Only a single selected file is handled.
            if (this.files.length != 1) return;

            const file = this.files[0];
            const fileName = file.name;
            const reader = new FileReader();
            reader.onload = evt => {
                const message = <Request>{
                    action: Actions.UPLOAD_STATE,
                    state: evt.target.result,
                    fileName: fileName
                };
                chrome.runtime.sendMessage(message, r => {
                    if (r) console.log('State sent:', r);
                });
            };
            reader.readAsText(file, "UTF-8");
        });
}
|
||||||
|
Before Width: | Height: | Size: 36 KiB After Width: | Height: | Size: 36 KiB |
@ -3,9 +3,9 @@
|
|||||||
<link>
|
<link>
|
||||||
<meta charset="utf-8">
|
<meta charset="utf-8">
|
||||||
<title>Data Extractor</title>
|
<title>Data Extractor</title>
|
||||||
<script charset="UTF-8" type="text/javascript" src="tip.js"></script>
|
<script charset="UTF-8" type="text/javascript" src="../scripts/popup.bundle.js"></script>
|
||||||
|
|
||||||
<link rel="stylesheet" href="styles/bootstrap.min.css">
|
<link rel="stylesheet" href="../assets/bootstrap.min.css">
|
||||||
</head>
|
</head>
|
||||||
|
|
||||||
<body style="margin: 20px 10px;">
|
<body style="margin: 20px 10px;">
|
||||||
@ -18,13 +18,12 @@
|
|||||||
<div class="row">
|
<div class="row">
|
||||||
<div class="col">
|
<div class="col">
|
||||||
<div class="alert alert-info small">
|
<div class="alert alert-info small">
|
||||||
<!-- <h6>Usage:</h6> -->
|
|
||||||
<p>
|
<p>
|
||||||
Go to <a href="#" id="link-extension-detail">Extension Detail</a>, click "background page",
|
Go to <a href="#" id="link-extension-detail">Extension Detail</a>, click "background page",
|
||||||
and type your scripts in the console.
|
and type your scripts in the console.
|
||||||
</p>
|
</p>
|
||||||
<p>
|
<p>
|
||||||
<img src="../images/console.png" alt=""
|
<img src="../assets/console.png" alt=""
|
||||||
style="max-width: 489px; width: 100%; border-radius: 5px">
|
style="max-width: 489px; width: 100%; border-radius: 5px">
|
||||||
</p>
|
</p>
|
||||||
|
|
||||||
@ -32,7 +31,6 @@
|
|||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
<div class="row">
|
<div class="row">
|
||||||
|
|
||||||
<div class="col">
|
<div class="col">
|
||||||
<h6>Quick Start</h6>
|
<h6>Quick Start</h6>
|
||||||
</div>
|
</div>
|
||||||
@ -42,22 +40,33 @@
|
|||||||
<div class="alert alert-success small">
|
<div class="alert alert-success small">
|
||||||
<p>
|
<p>
|
||||||
<b>Extract current page</b>:
|
<b>Extract current page</b>:
|
||||||
<br>new Extractor().task(".list-item", ["a.title", "p.content"]).start();
|
<br>> $(".list-item", ["a.title", "p.content"]);
|
||||||
</p>
|
</p>
|
||||||
<p>
|
<p>
|
||||||
<b>Extract multiple pages (1-10, interval 1)</b>:
|
<b>Extract multiple pages (1-10, interval 1)</b>:
|
||||||
<br>new Extractor().task(".list-item", ["a.title", "p.content"],
|
<br>> job=new Extractor().task(".list-item", ["a.title", "p.content"],
|
||||||
"http://sample.com/?pn=${page}", 1, 10, 1).start();
|
"http://sample.com/?pn=${page}", 1, 10, 1);
|
||||||
|
<br>> job.start();
|
||||||
|
|
||||||
</p>
|
</p>
|
||||||
<p>
|
<p>
|
||||||
<b>Full document:</b>
|
<b>Full document at:</b>
|
||||||
<br>
|
<br>
|
||||||
<a href="#" id="link-document">https://git.jebbs.co/jebbs/data-extracter-extesion</a>
|
<a href="#" id="link-document">https://git.qjebbs.com/jebbs/data-extracter-extesion</a>
|
||||||
</p>
|
</p>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
<div class="row">
|
||||||
|
<div class="col">
|
||||||
|
<h6>Saved State</h6>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="row">
|
||||||
|
<div class="col">
|
||||||
|
<input type="file" name="state" id="state-input">
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</body>
|
</body>
|
||||||
|
|
||||||
|
Before Width: | Height: | Size: 4.1 KiB After Width: | Height: | Size: 4.1 KiB |
@ -1,7 +1,7 @@
|
|||||||
{
|
{
|
||||||
"manifest_version": 2,
|
"manifest_version": 2,
|
||||||
"name": "Data Extracter",
|
"name": "Data Extracter",
|
||||||
"version": "0.1.0",
|
"version": "0.5.1",
|
||||||
"author": "jebbs",
|
"author": "jebbs",
|
||||||
"description": "Extract data from web page elements as sheet.",
|
"description": "Extract data from web page elements as sheet.",
|
||||||
"icons": {
|
"icons": {
|
||||||
@ -11,32 +11,25 @@
|
|||||||
},
|
},
|
||||||
"browser_action": {
|
"browser_action": {
|
||||||
"default_icon": "icon.png",
|
"default_icon": "icon.png",
|
||||||
"default_popup": "popup/tip.html",
|
"default_popup": "html/popup.html",
|
||||||
"default_title": "Data Extracter"
|
"default_title": "Data Extracter"
|
||||||
},
|
},
|
||||||
"background": {
|
"background": {
|
||||||
"scripts": [
|
"scripts": [
|
||||||
"scripts/shared/tools.js",
|
"scripts/background.bundle.js"
|
||||||
"scripts/shared/common.js",
|
|
||||||
"scripts/background/messaging.js",
|
|
||||||
"scripts/background/result.js",
|
|
||||||
"scripts/background/signiture.js",
|
|
||||||
"scripts/background/actions.js",
|
|
||||||
"scripts/background/extractor.js",
|
|
||||||
"scripts/background/helpers.js"
|
|
||||||
],
|
],
|
||||||
"persistent": false
|
"persistent": false
|
||||||
},
|
},
|
||||||
"content_scripts": [{
|
"content_scripts": [{
|
||||||
"matches": ["*://*/*"],
|
"matches": ["*://*/*"],
|
||||||
"js": [
|
"js": [
|
||||||
"scripts/shared/tools.js",
|
"scripts/content.bundle.js"
|
||||||
"scripts/shared/common.js",
|
|
||||||
"scripts/content/content.js"
|
|
||||||
],
|
],
|
||||||
"run_at": "document_idle"
|
"run_at": "document_idle"
|
||||||
}],
|
}],
|
||||||
|
"incognito": "spanning",
|
||||||
"permissions": [
|
"permissions": [
|
||||||
"activeTab"
|
"activeTab",
|
||||||
|
"notifications"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
12
tsconfig.json
Normal file
12
tsconfig.json
Normal file
@ -0,0 +1,12 @@
|
|||||||
|
{
|
||||||
|
"compilerOptions": {
|
||||||
|
"module": "commonjs",
|
||||||
|
"target": "es6",
|
||||||
|
"noImplicitAny": false,
|
||||||
|
"sourceMap": true,
|
||||||
|
"rootDir": "src",
|
||||||
|
"outDir": "dist/js",
|
||||||
|
"noEmitOnError": true,
|
||||||
|
"typeRoots": [ "node_modules/@types" ]
|
||||||
|
}
|
||||||
|
}
|
||||||
33
webpack.config.js
Normal file
33
webpack.config.js
Normal file
@ -0,0 +1,33 @@
|
|||||||
|
const path = require('path');
|
||||||
|
const CopyPlugin = require('copy-webpack-plugin');
|
||||||
|
|
||||||
|
module.exports = {
|
||||||
|
mode: 'production',
|
||||||
|
entry: {
|
||||||
|
background: './src/background/index.ts',
|
||||||
|
content: './src/content/index.ts',
|
||||||
|
popup: './src/popup/index.ts',
|
||||||
|
},
|
||||||
|
// devtool: 'inline-source-map',
|
||||||
|
output: {
|
||||||
|
path: path.resolve(__dirname, 'dist'),
|
||||||
|
filename: 'scripts/[name].bundle.js'
|
||||||
|
},
|
||||||
|
module: {
|
||||||
|
rules: [
|
||||||
|
{
|
||||||
|
test: /\.tsx?$/,
|
||||||
|
use: 'ts-loader',
|
||||||
|
exclude: /node_modules/
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
resolve: {
|
||||||
|
extensions: ['.tsx', '.ts', '.js']
|
||||||
|
},
|
||||||
|
plugins: [
|
||||||
|
new CopyPlugin([
|
||||||
|
{ from: '**/*', to: '.', toType: "dir" },
|
||||||
|
], { context: 'template', logLevel: 'warn' }),
|
||||||
|
]
|
||||||
|
};
|
||||||
Reference in New Issue
Block a user