Crawler error: UnicodeDecodeError: 'utf-8' codec can't decode byte 0x8b in position 1: invalid start byte

created at 07-15-2021

Description

After crawling http://www.example.com/ajax-poets/, converting the returned HTML bytes into a string raises an error:

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x8b in position 1: invalid start byte

Some investigation shows that the response body is gzip-compressed, so decoding the raw bytes directly as UTF-8 fails.
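The byte value itself is the giveaway: every gzip stream starts with the magic bytes 0x1f 0x8b, so a UTF-8 decoder always chokes on byte 0x8b at position 1. A minimal snippet, independent of the crawler, reproduces the same failure:

import gzip

payload = gzip.compress('<div>poets</div>'.encode('utf-8'))
print(payload[:2])       # b'\x1f\x8b' -- the gzip magic number
payload.decode('utf-8')  # UnicodeDecodeError: can't decode byte 0x8b in position 1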

Original code:

import random
from urllib.parse import urlencode
from urllib import request

# Minimal stand-in for the User-Agent pool the original script defines elsewhere
USER_AGENTS = [{"User-Agent": "Mozilla/5.0"}]

class SpiderMain():

    def craw_pinying_name(self, surname):
        url = "http://www.example.com/ajax-poets/"
        params_dict = {
            'token': "@@E62A47",
            'surname': surname,
            'kw': "",
            'dynasty': "",
            'poet': "",
            'length': "",
            'language': "zh-cn",
            'shome': 1,
        }

        html_content = self.post(url, params_dict)

    def post(self, url: str, params: dict) -> str:
        data = urlencode(params).encode('utf-8')
        headers = {"User-Agent": random.choice(USER_AGENTS)["User-Agent"],
                   "Cookie": "",
                   "Accept": "text/html, */*; q=0.01",
                   "Accept-Encoding": "gzip, deflate",
                   "Accept-Language": "zh-CN,zh;q=0.9",
                   "Connection": "keep-alive",
                   "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
                   "Host": "www.example.com",
                   "Origin": "http://www.example.com",
                   "Referer": "http://www.example.com/search.html?by=author",
                   "X-Requested-With": "XMLHttpRequest",
                   }

        req = request.Request(url, headers=headers)
        response = request.urlopen(req, data=data)

        if response.getcode() == 200:
            content = response.read()
            print(response.headers.items())  # the list includes ('Content-Encoding', 'gzip')
            return content.decode('utf-8')   # raises UnicodeDecodeError on gzip bodies

s = SpiderMain()
s.craw_pinying_name("A")

Solution

import gzip
import random
from io import BytesIO
from urllib.parse import urlencode
from urllib import request

# Minimal stand-in for the User-Agent pool the original script defines elsewhere
USER_AGENTS = [{"User-Agent": "Mozilla/5.0"}]

class SpiderMain():

    def craw_pinying_name(self, surname):
        url = "http://www.example.com/ajax-poets/"
        params_dict = {
            'token': "@@E62A47",
            'surname': surname,
            'kw': "",
            'dynasty': "",
            'poet': "",
            'length': "",
            'language': "zh-cn",
            'shome': 1,
        }

        html_content = self.post(url, params_dict)

    def post(self, url: str, params: dict) -> str:
        data = urlencode(params).encode('utf-8')
        headers = {"User-Agent": random.choice(USER_AGENTS)["User-Agent"],
                   "Cookie": "",
                   "Accept": "text/html, */*; q=0.01",
                   "Accept-Encoding": "gzip, deflate",
                   "Accept-Language": "zh-CN,zh;q=0.9",
                   "Connection": "keep-alive",
                   "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
                   "Host": "www.example.com",
                   "Origin": "http://www.example.com",
                   "Referer": "http://www.example.com/search.html?by=author",
                   "X-Requested-With": "XMLHttpRequest",
                   }

        req = request.Request(url, headers=headers)
        response = request.urlopen(req, data=data)

        if response.getcode() == 200:
            content = response.read()
            # Trust the server's Content-Encoding header to pick the decode path
            if response.headers.get('Content-Encoding') == 'gzip':
                buff = BytesIO(content)
                f = gzip.GzipFile(fileobj=buff)
                return f.read().decode('utf-8')
            else:
                return content.decode('utf-8')

s = SpiderMain()
s.craw_pinying_name("A")
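
As a side note, the BytesIO plus GzipFile pair can be collapsed into a single call to gzip.decompress (available since Python 3.2). A minimal sketch of the same decode step, with decode_body as an illustrative name that is not part of the original code:

import gzip

def decode_body(content, content_encoding):
    # Equivalent to the BytesIO + GzipFile pair above, in one call
    if content_encoding == 'gzip':
        content = gzip.decompress(content)
    return content.decode('utf-8')

print(decode_body(gzip.compress(b'<div>ok</div>'), 'gzip'))  # <div>ok</div>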

Check whether the response headers contain Content-Encoding: gzip; if so, gunzip the body before decoding, otherwise decode the bytes directly. In practice it turns out that this URL returns pages inconsistently: some responses are gzip-compressed and some are not. Without this check, decompressing unconditionally runs into the opposite error:

gzip.BadGzipFile: Not a gzipped file (b'<d')
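
Here b'<d' is just the first two bytes of the uncompressed HTML body. As an extra safeguard against servers whose Content-Encoding header and body disagree, the gzip magic bytes can be sniffed directly instead of trusting the header alone; a sketch, with maybe_gunzip as an illustrative name, not part of the original code:

import gzip

def maybe_gunzip(content: bytes) -> bytes:
    # Every gzip stream starts with the magic bytes 0x1f 0x8b;
    # anything else (such as raw HTML beginning with b'<d') passes through untouched.
    if content[:2] == b'\x1f\x8b':
        return gzip.decompress(content)
    return content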