今天我写了一个很简单的 Node.js 应用,使用 request 模块同时向简书(jianshu)网站发起多个异步请求,以分页的方式获取我所有的文章列表。
/*
 * Crawl the paginated article list of one jianshu.com user.
 *
 * Pages 1..MAX are requested concurrently. Every article link found in the
 * returned HTML is stored as title -> absolute URL, and the collected pairs
 * are printed once all page promises have resolved.
 */
const request = require("request");
const jsdom = require("jsdom");
const JSDOM = jsdom.JSDOM;

const PREFIX = "https://www.jianshu.com";
const PAGE = "https://www.jianshu.com/u/99b8712e8850?order_by=shared_at&page=";
const MAX = 100;

// Article title -> absolute article URL, filled in by the page callbacks.
const mArticleResult = new Map();

/* a given article: https://www.jianshu.com/p/963cd23fb092
   value got from API: /p/5c1d0319dc42 */
let lastPageReached = false;
const aHandlers = [];

// The options are constant, so the configured request wrapper is built once
// instead of once per page.
const requestC = request.defaults({ jar: true });

// use limited for loop to ease testing
for (let i = 0; i < MAX; i++) {
  const pageNumber = i + 1;
  const pageOptions = {
    url: PAGE + pageNumber,
    method: "GET",
    headers: {
      "Accept": "text/html"
    }
  };
  aHandlers.push(getArticles(pageOptions, pageNumber));
  // NOTE: the flag is only set inside the response callbacks, which run
  // after this synchronous loop has finished. It is therefore still false
  // here and all MAX requests are dispatched at once.
  if (lastPageReached) break;
}
console.log("promise handler size: " + aHandlers.length);

Promise.all(aHandlers)
  .then(function () {
    let articleIndex = 0;
    for (const [key, value] of mArticleResult) {
      console.log("Article[" + articleIndex++ + "]: " + key + " = " + value);
    }
    console.log("done");
  })
  .catch(function (error) {
    // getArticles never rejects, so this only reports failures raised while
    // printing the result.
    console.log("error: " + error);
  });

/**
 * Fetch one list page and record every article link it contains.
 *
 * @param {object} pageOptions - Options handed to `request` (url, method, headers).
 * @param {number} pageNumber - 1-based page index; used in log output and as
 *   the resolved value.
 * @returns {Promise<number|Error>} Resolves with `pageNumber`, or with the
 *   error object when the request itself failed. The promise is never
 *   rejected, so one failed page does not abort the surrounding Promise.all.
 */
function getArticles(pageOptions, pageNumber) {
  return new Promise(function (resolve) {
    requestC(pageOptions, function (error, response, body) {
      if (error) {
        console.log("error: " + error);
        resolve(error);
        // Stop here: there is no body to parse after a transport error.
        return;
      }
      if (response.statusCode < 200 || response.statusCode >= 300) {
        // E.g. 429 when the server throttles the burst of requests; the body
        // is then an error page, not an article list.
        console.log("page " + pageNumber + " returned HTTP status " + response.statusCode);
        resolve(pageNumber);
        return;
      }
      collectArticles(new JSDOM(body).window.document);
      resolve(pageNumber);
    });
  });
}

/**
 * Store every article link of a parsed list page in mArticleResult.
 *
 * @param {Document} document - Parsed HTML document of one list page.
 */
function collectArticles(document) {
  // Same structure the script has always looked for: <li> -> child <div> ->
  // child <a>. Requiring [href] skips anchors without a link target, whose
  // getAttribute("href") would be null.
  const links = document.querySelectorAll("li > div > a[href]");
  for (const link of links) {
    const fragment = link.getAttribute("href");
    // Article links have the form /p/<id>.
    if (!fragment.startsWith("/p/")) continue;
    // console.log("title: " + link.text);
    const wholeURL = PREFIX + fragment;
    // console.log("url: " + wholeURL);
    if (mArticleResult.has(link.text)) {
      // A title that was already collected is treated by this script as the
      // sign that paging has run past the last page.
      lastPageReached = true;
      console.log("article size: " + mArticleResult.size);
    }
    mArticleResult.set(link.text, wholeURL);
  }
}
我观察到一个很奇怪的现象:
当我把上面代码中变量MAX的值设得很小,比如10以下,也就是一次只发送不到10个并发请求,此时这个nodejs应用工作完全正常。
然而当我把MAX改成100后,发现很多请求的数据并没有从jianshu网站上返回。经过调试发现,这些出问题的请求,接到的statusCode为429.
百度学习了一下429的含义:
429(Too Many Requests)表示客户端在给定的时间内发送了过多的请求。当你需要限制客户端请求某个服务的数量,也就是限制请求速度时,该状态码就会非常有用。在此之前,有一些类似的状态码,例如“509 Bandwidth Limit Exceeded”。因此我这个应用要么降低并发请求的发送频率,要么把异步并发请求改成同步(即逐页依次发送)。