参考地址 Puppeteer拦截某条url并返回其响应内容(场景和方法) API RequestInterception拦截器的使用
page.setRequestInterception(true)拦截器的使用方法和场景
现附上 Puppeteer 的 API 文档链接:https://github.com/GoogleChrome/puppeteer/blob/master/docs/api.md
实用场景(没错,就是实用):比如我用 Puppeteer 打开某个网页,但只想拿到其中某一条 url 的 response 内容;又或者我需要截图或生成 PDF,但不需要加载图片,这时就可以把图片类型的请求过滤掉。
使用的 API(直接定位到对应章节的链接):https://github.com/GoogleChrome/puppeteer/blob/master/docs/api.md#class-request
主要是class: Request 和 class: Response 两大块相结合
官方例子参考1:
// Enable interception, then answer every intercepted request with a canned
// 404 response.
await page.setRequestInterception(true);
page.on('request', (interceptedRequest) => {
  const notFoundResponse = {
    status: 404,
    contentType: 'text/plain',
    body: 'Not Found!',
  };
  interceptedRequest.respond(notFoundResponse);
});
实际使用的例子参考1:
'use strict';
const puppeteer = require('puppeteer');
// Example 1: open a page, print the navigation response, and report every
// request that fails while the page loads.
(async () => {
  // Declared outside the try block so that `finally` can always close it.
  let browser;
  try {
    browser = await puppeteer.launch({
      ignoreHTTPSErrors: true,
      headless: false,
      args: ['--no-sandbox', '--disable-setuid-sandbox'],
    });
    const page = await browser.newPage();

    // Attach the listener BEFORE navigating; otherwise failures that happen
    // during the page load are never reported. 'requestfailed' does not need
    // request interception, and enabling interception without a 'request'
    // handler that continues/aborts/responds would leave every request stalled.
    page.on('requestfailed', (request) => {
      console.log(`${request.url()} ${request.failure().errorText}`);
    });

    const response = await page.goto('http://www.google.com');
    console.log(response);

    // Other useful accessors on the navigation response:
    //   response.ok()          -> true when the status code is in the 200-299 range
    //   response.status()      -> the HTTP status code
    //   response.headers()     -> the HTTP response headers
    //   await response.text()  -> the response body as text
    //   await response.json()  -> the body parsed as JSON; rejects when the body
    //                             is not valid JSON (e.g. an HTML page), so only
    //                             call it on JSON endpoints
  } catch (err) {
    console.error(err);
  } finally {
    // Close the browser on both success and failure so no Chromium process leaks.
    if (browser) {
      await browser.close();
    }
  }
})();
实际参考例子2:
'use strict';
const puppeteer = require('puppeteer');
/**
 * Builds a 'request' event handler that aborts image requests and lets every
 * other request continue.
 *
 * The returned handler keeps its own running count of blocked images and logs
 * that count together with the blocked URL.
 *
 * @returns {function(Object): void} handler to pass to page.on('request', ...)
 */
function createImageBlocker() {
  let blockedCount = 0;
  return (request) => {
    if (request.resourceType() === 'image') {
      console.log(`${blockedCount}image: `);
      // url is a method; logging `request.url` would print the function itself.
      // No response is logged here: at request time none exists yet.
      console.log(request.url());
      blockedCount += 1;
      request.abort();
      return;
    }
    // A canned reply could be served here with request.respond({...}) instead.
    console.log('continue');
    request.continue();
  };
}

// Example 2: take a full-page screenshot while skipping every image request.
(async () => {
  // Declared outside the try block so that `finally` can always close it.
  let browser;
  try {
    browser = await puppeteer.launch({
      ignoreHTTPSErrors: true,
      headless: false,
      args: ['--no-sandbox', '--disable-setuid-sandbox'],
    });
    const page = await browser.newPage();
    await page.setRequestInterception(true);
    page.on('request', createImageBlocker());
    await page.goto('https://news.google.com/news/');
    await page.screenshot({ path: 'news.png', fullPage: true });
  } catch (e) {
    console.log(e);
  } finally {
    // Close the browser on both success and failure so no Chromium process leaks.
    if (browser) {
      await browser.close();
    }
  }
})();
代码注释很清楚,也很容易理解。以上是两个比较常用的例子,接下来是实战中更常用的例子。
实际参考例子3(重点):
/**
 * Waits for the first response whose URL equals `targetUrl` and resolves with
 * its body text.
 *
 * Every request seen on the page is logged and then continued, so the caller
 * is expected to have enabled request interception on `page` beforehand.
 * The 'request' listener is left attached after the match because it is what
 * keeps intercepted requests flowing; the 'response' listener is removed once
 * the target response has been seen.
 *
 * @param page Puppeteer page (any emitter of 'request' / 'response' events).
 * @param {string} [targetUrl='https://test.do'] exact URL to capture.
 * @returns {Promise<string|undefined>} the response body text, or `undefined`
 *   when the body could not be read (the error is logged, never thrown).
 */
async function getResponseMsg(page, targetUrl = 'https://test.do') {
  return new Promise((resolve) => {
    const onResponse = (response) => {
      if (response.url() !== targetUrl) {
        return;
      }
      // Only the first matching response is wanted; detach so repeated
      // requests to the same URL do not pile up listeners.
      page.off('response', onResponse);

      const req = response.request();
      console.log("Response 的:" + req.method(), response.status(), req.url());

      response.text().then(resolve, (err) => {
        // Keep the never-rejecting contract, but report the failure instead
        // of hiding it.
        console.error(err);
        resolve(undefined);
      });
    };
    page.on('response', onResponse);

    page.on('request', (request) => {
      console.log(request.url());
      if (request.url() === targetUrl) {
        console.log("拦截到了这条url然后就该请求了");
      } else {
        console.log("continue");
      }
      // Both branches let the request through; only the logging differs.
      request.continue();
    });
  });
}
稍微解释一下上面这个例子:拦截指定的 url,拿到它的响应内容后返回。代码清晰,不多赘述。全是爬坑干货,欢迎一起爬坑。