這篇文章主要介紹了nodejs通過phantomjs實現下載網頁的方法,有需要的小伙伴可以參考下。
功能其實很簡單:通過 phantomjs.exe 采集 url 加載的資源,再通過子進程的方式啟動 nodejs 下載所有的資源;對于 css 資源,會匹配 css 內容,下載里面 url 引用的資源。
當然功能還是很簡單的,在響應式設計和異步加載的情況下,還是有很多資源沒有能夠下載,需要根據實際情況處理下
首先當然是下載 nodejs 和 phantomjs
下面是 phantomjs.exe 執行的 down.js
// down.js - run by PhantomJS. Opens the page given on the command line,
// records the URL of every resource the page receives, then starts
// `node downHtml.js` as a child process to save those resources to disk.
var page = require('webpage').create();
var system = require('system');
var spawn = require('child_process').spawn;

if (system.args.length === 1) {
  // Only the script name was passed: no target URL.
  console.log('Usage: down.js <some URL>');
  phantom.exit(1);
} else {
  var urls = [];
  page.address = system.args[1];

  // Record a resource only at its 'start' stage so that a resource reported
  // in more than one stage is pushed a single time.
  page.onResourceReceived = function (res) {
    if (res.stage === 'start') {
      urls.push(res.url);
    }
  };

  page.open(page.address, function (status) {
    if (status !== 'success') {
      console.log('FAIL to load the address');
      phantom.exit(1);
      return;
    }

    console.log('down resource ' + urls.length + ' urls.');

    // The whole list travels as ONE comma-separated argument; downHtml.js
    // splits it on ',' again.
    var child = spawn('node', ['--harmony', 'downHtml.js', urls.join(',')]);

    // Relay everything the downloader prints.
    child.stdout.on('data', function (data) {
      console.log(data);
    });
    child.stderr.on('data', function (data) {
      console.log(data);
    });

    // Finish when the downloader finishes, passing its exit status on.
    child.on('exit', function (code) {
      phantom.exit(code || 0);
    });
  });
}
下面是對應的node運行的 downHtml.js
- "use strict";
- var fs = require('fs');
- var http = require('http');
- var path = require('path');
- var r_url = require('url');
// Directories already known to exist; lets repeated requests for the same
// directory skip the filesystem check.
var dirCache = {};

/**
 * Make sure the directory `pathStr` exists, creating missing parent
 * directories first, then invoke `callback` (called with no arguments).
 *
 * Best-effort: a failing mkdir is reported on stderr and `callback` is still
 * invoked, so one bad directory does not stall the remaining downloads.
 *
 * @param {string} pathStr - directory path to create.
 * @param {Function} callback - invoked once the attempt has finished.
 */
function makedir(pathStr, callback) {
  if (dirCache[pathStr] === 1) {
    callback();
    return;
  }

  fs.stat(pathStr, function (statErr) {
    if (!statErr) {
      // Path already exists: remember it and carry on.
      dirCache[pathStr] = 1;
      callback();
      return;
    }

    var createSelf = function () {
      fs.mkdir(pathStr, function (mkdirErr) {
        // EEXIST means a concurrent makedir call created it first.
        if (!mkdirErr || mkdirErr.code === 'EEXIST') {
          dirCache[pathStr] = 1;
        } else {
          console.error('mkdir ' + pathStr + ' failed: ' + mkdirErr.message);
        }
        callback();
      });
    };

    var parent = path.dirname(pathStr);
    if (parent === pathStr) {
      // A path that is its own parent (e.g. a missing drive root) cannot be
      // walked up any further; recursing would never terminate.
      createSelf();
    } else {
      makedir(parent, createSelf);
    }
  });
}
// Finds every `url(...)` reference that follows a ':' or ',' in CSS text.
var reg = /[:,]\s*url\((['"]?).*?\1\)/g;
// Splits one such reference: m[1] is the optional quote, m[2] the URL text.
var reg2 = /\((['"]?)(.*?)(\1)\)/;
// Absolute URLs already scheduled for download, so each is fetched only once.
var isDownMap = {};

/**
 * Fetch the stylesheet at `URL`, find the url(...) references inside it and
 * save every referenced resource to CWD/<hostname>/<pathname>.
 *
 * References that do not resolve to an http(s) URL (for example inline
 * `data:` URIs) are skipped. Network failures are reported on stderr.
 *
 * @param {string} URL - absolute URL of the CSS file.
 */
var downImgFromCss = function (URL) {
  var cssProtocol = r_url.parse(URL).protocol;
  if (cssProtocol !== 'http:' && cssProtocol !== 'https:') {
    return;
  }
  // http.get throws on https URLs, so pick the module matching the scheme.
  var cssClient = cssProtocol === 'https:' ? require('https') : http;

  cssClient.get(URL, function (res) {
    var body = "";
    res.setEncoding('utf8');
    res.on('data', function (chunk) {
      body += chunk;
    });
    res.on('end', function () {
      // String#match yields null (not []) when the CSS has no url(...) at all.
      var references = body.match(reg) || [];

      references.forEach(function (reference) {
        var m = reference.match(reg2);
        var rawUrl = m && m[2] ? m[2].trim() : '';
        if (!rawUrl) {
          return;
        }

        // References are relative to the stylesheet, not to the page.
        var imgUrl = r_url.resolve(URL, rawUrl);
        if (isDownMap[imgUrl]) {
          return;
        }
        isDownMap[imgUrl] = 1;

        var uo = r_url.parse(imgUrl);
        if (uo.protocol !== 'http:' && uo.protocol !== 'https:') {
          return;
        }
        var client = uo.protocol === 'https:' ? require('https') : http;
        var filepath = CWD + '/' + uo.hostname + uo.pathname;

        makedir(path.dirname(filepath), function () {
          client.get(imgUrl, function (imgRes) {
            imgRes.pipe(fs.createWriteStream(filepath));
          }).on('error', function (err) {
            console.error('fail to download ' + imgUrl + ': ' + err.message);
          });
        });
      });
    });
  }).on('error', function (err) {
    console.error('fail to load css file ' + URL + ': ' + err.message);
  });
};
// The resource URLs arrive as ONE comma-separated command-line argument.
var URLS = (process.argv[2] || '').split(',').filter(function (item) {
  return item !== '';
});
// Downloads are stored below the directory the script was started from.
var CWD = process.cwd();

if (URLS.length === 0) {
  console.error('Usage: node --harmony downHtml.js <url>[,<url>...]');
}

// Download every resource to CWD/<hostname>/<pathname>.
URLS.forEach(function (URL) {
  var uo = r_url.parse(URL);

  // data:, about: and similar entries cannot be fetched over the network.
  if (uo.protocol !== 'http:' && uo.protocol !== 'https:') {
    console.log('skip unsupported url:' + URL.slice(0, 80));
    return;
  }
  // http.get throws on https URLs, so pick the module matching the scheme.
  var client = uo.protocol === 'https:' ? require('https') : http;

  // A path that names a directory ("/" or ".../") is stored as its index.html.
  var pathname = uo.pathname || '/';
  if (pathname.charAt(pathname.length - 1) === '/') {
    pathname += 'index.html';
  }
  var filepath = CWD + '/' + uo.hostname + pathname;

  makedir(path.dirname(filepath), function () {
    client.get(URL, function (res) {
      var contentType = res.headers['content-type'] || '';
      // Stylesheets are additionally scanned for the resources they reference.
      if (URL.indexOf('.css') !== -1 || contentType.indexOf('text/css') !== -1) {
        console.log('down images form css file:' + URL + '.');
        downImgFromCss(URL);
      }
      res.pipe(fs.createWriteStream(filepath));
    }).on('error', function (err) {
      console.error('fail to download ' + URL + ': ' + err.message);
    });
  });
});
將 down.js 和 downHtml.js 放在同一個文件夾下,然後通過下列 cmd 命令運行:
D:/phantomjs-2.0.0-windows/bin/phantomjs.exe down.js http://www.youku.com/
以上所述就是本文的全部內容了,希望大家能夠喜歡。
新聞熱點
疑難解答
圖片精選