Node.js produces garbled text when crawling Chinese web pages that are not encoded in UTF-8. For example, NetEase's home page is encoded as gb2312, so its content comes out garbled when fetched like this.
The code copy is as follows:
var request = require('request')
var url = 'http://www.163.com'

// No `encoding` option is passed, so request converts the response body to a
// string as UTF-8. The page is gb2312, which is why the printed text is garbled.
request(url, function (err, res, body) {
  if (err) {
    // Report the failed request instead of printing an undefined body.
    console.error(err)
    return
  }
  console.log(body)
})
You can use iconv-lite to solve it
Install
The code copy is as follows:
npm install iconv-lite
At the same time, we set a browser-like User-Agent header so that the website does not block the request:
The code copy is as follows:
var originRequest = require('request')
var iconv = require('iconv-lite')
// Request headers sent with every fetch; the User-Agent mimics a desktop
// Chrome browser.
var headers = {}
headers['User-Agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.65 Safari/537.36'
/**
 * Fetch `url` through the original `request` module using the shared headers.
 *
 * `encoding: null` makes the library hand the body back as a raw Buffer
 * instead of a decoded string, so the caller can pick the charset itself.
 *
 * @param {string} url - Address of the page to fetch.
 * @param {Function} callback - Passed through unchanged to `originRequest`.
 */
function request (url, callback) {
  originRequest({ headers: headers, encoding: null, url: url }, callback)
}
request(url, function (err, res, body) {
  if (err) {
    // Without this guard a failed request would pass an undefined body to
    // iconv.decode.
    console.error(err)
    return
  }
  // The wrapper requests the body with `encoding: null`, so it is decoded
  // here from gb2312 rather than by the request library.
  var html = iconv.decode(body, 'gb2312')
  console.log(html)
})
Solve the garbled problem
Use cheerio to parse HTML
Cheerio can be roughly understood as a server-side jQuery selector engine. Working with it is much more intuitive than using regular expressions.
Install
The code copy is as follows:
npm install cheerio
var cheerio = require('cheerio')

request(url, function (err, res, body) {
  if (err) {
    // Without this guard a failed request would pass an undefined body to
    // iconv.decode.
    console.error(err)
    return
  }
  var html = iconv.decode(body, 'gb2312')
  var $ = cheerio.load(html)
  // Print the heading as plain text, then as markup.
  console.log($('h1').text())
  console.log($('h1').html())
})
The output is as follows
The code copy is as follows:
网易
&#x7F51;&#x6613;
So here is the problem: the markup returned by $('h1').html() is entity-encoded, so 网易 comes out as &#x7F51;&#x6613;, which complicates further string processing.
Solving the "garbled" output of cheerio's .html()
According to the documentation, this entity conversion can be turned off with the decodeEntities option.
The code copy is as follows:
var $ = cheerio.load(html)
Change to
The code copy is as follows:
// decodeEntities: false stops .html() from emitting numeric character entities.
var $ = cheerio.load(html, { decodeEntities: false })
That fixes it. The complete code is as follows:
The code copy is as follows:
var originRequest = require('request')
var cheerio = require('cheerio')
var iconv = require('iconv-lite')

// Browser-like User-Agent so the site does not block the crawler.
var headers = {
  'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.65 Safari/537.36'
}

/**
 * Fetch `url` through the original `request` module using the shared headers.
 *
 * `encoding: null` makes the library hand the body back as a raw Buffer
 * instead of a decoded string, so the caller can pick the charset itself.
 *
 * @param {string} url - Address of the page to fetch.
 * @param {Function} callback - Passed through unchanged to `originRequest`.
 */
function request (url, callback) {
  var options = {
    url: url,
    encoding: null,
    headers: headers
  }
  originRequest(options, callback)
}

var url = 'http://www.163.com'

request(url, function (err, res, body) {
  if (err) {
    // Without this guard a failed request would pass an undefined body to
    // iconv.decode.
    console.error(err)
    return
  }
  // The page is served as gb2312, so decode the raw bytes explicitly.
  var html = iconv.decode(body, 'gb2312')
  // decodeEntities: false stops .html() from emitting numeric character entities.
  var $ = cheerio.load(html, { decodeEntities: false })
  console.log($('h1').text())
  console.log($('h1').html())
})