Python Crawler (Part 2): The Python 3 Built-in Module Urllib

Preface

The following is an overview of Urllib; for a deeper understanding, the official documentation is strongly recommended.

English documentation: Urllib https://docs.python.org/3/library/urllib.html

Urllib

urlopen

urlopen is suitable for simple requests that do not need custom header information.

urllib.request.urlopen(url, data=None, [timeout, ]*, cafile=None, capath=None, cadefault=False, context=None)

import urllib.request
import urllib.parse
import urllib.error
import socket

"""
1.url:就是打开的测试地址  http://httpbin.org
2.data:发送post请求必须设置的参数,通过bytes(urllib.parse.urlencode())可以将post的数据进行转换放到urllib.request.urlopen的data参数中。
3.timeout:是一个超时设置,超时则抛出异常
"""
data = bytes(urllib.parse.urlencode({'word':'hello'}), encoding='utf8')
try:
    response = urllib.request.urlopen(url='http://httpbin.org/post', data=data, timeout=5)
    print(response.read())
except urllib.error.URLError as e:
    if isinstance(e.reason, socket.timeout):
        print('timed out...')
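
For comparison, GET parameters travel in the query string rather than in data. A minimal sketch against the same httpbin.org test service (the word parameter is just an illustration):

from urllib import parse, request

# Encode the parameters and append them to the URL as a query string
params = parse.urlencode({'word': 'hello'})
response = request.urlopen('http://httpbin.org/get?' + params, timeout=5)
print(response.read().decode('utf-8'))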
The object returned by urlopen is an http.client.HTTPResponse, whose status and headers can be inspected directly:

import urllib.request

response = urllib.request.urlopen(url='https://www.baidu.com/')
print(type(response))
print(response.status)
print(response.getheaders())
print(response.getheader('Server'))
"""
<class 'http.client.HTTPResponse'>
200
[('Accept-Ranges', 'bytes'), ('Cache-Control', 'no-cache'),  ......]
BWS/1.1
"""

request

If you need to set header information on a request, you need to use Request. The examples below focus on how to add request headers:

from urllib import request, parse

url = 'http://httpbin.org/post'
# Method 1: build a headers dict and pass it to Request
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36',
    'Host': 'httpbin.org'
}
# Method 2: req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36')

# Use a name other than the built-in dict
form_data = {
    'word': 'hello'
}
data = bytes(parse.urlencode(form_data), encoding='utf8')
req = request.Request(url=url, data=data, headers=headers, method='POST')
response = request.urlopen(req)
print(response.read().decode('utf-8'))
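
The second approach from the comment above, add_header, attaches headers after the Request object is built. A minimal sketch, reusing the same URL and form data:

from urllib import request, parse

url = 'http://httpbin.org/post'
data = bytes(parse.urlencode({'word': 'hello'}), encoding='utf8')
req = request.Request(url=url, data=data, method='POST')
# Add headers one at a time instead of passing a headers dict
req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36')
response = request.urlopen(req)
print(response.read().decode('utf-8'))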

Advanced usage

import urllib.request

proxy_handler = urllib.request.ProxyHandler({
    'http': 'http://127.0.0.1:9743',
    'https': 'https://127.0.0.1:9743'
})
opener = urllib.request.build_opener(proxy_handler)
response = opener.open('http://httpbin.org/get')
print(response.read())

Cookies can be handled in the same way: attach an HTTPCookieProcessor to the opener, and the cookie jar collects whatever the server sets:

import http.cookiejar, urllib.request

cookie = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
for item in cookie:
    print(item.name+"="+item.value)
"""
BAIDUID=68AF7F00874AE2D8206AC4B524B49EAB:FG=1
BIDUPSID=68AF7F00874AE2D8206AC4B524B49EAB
H_PS_PSSID=1451_21090_18559_29064_28519_29098_28836_28584_26350
PSTM=1558969682
delPer=0
BDSVRTM=0
BD_HOME=0
"""

Exception handling

Catch exceptions such as 404 and 500 so that they do not stop the crawler.
The module provides URLError and HTTPError; HTTPError is a subclass of URLError, so it is caught first.

from urllib import request,error

try:
    response = request.urlopen("http://pythonsite.com/1111.html")
except error.HTTPError as e:
    print(e.reason)
    print(e.code)
    print(e.headers)
except error.URLError as e:
    print(e.reason)
else:
    print("reqeust successfully")
"""
Not Found
404
Date: Mon, 27 May 2019 15:12:43 GMT
Server: Apache
Vary: Accept-Encoding
Content-Length: 207
Connection: close
Content-Type: text/html; charset=iso-8859-1
"""

Utility module urlparse

urllib.parse.urlparse(urlstring, scheme='', allow_fragments=True)

from urllib.parse import urlparse

o = urlparse('http://www.cwi.nl:80/%7Eguido/Python.html', scheme='https')
print(o)
print(o.scheme, o.port, o.geturl())

"""
ParseResult(scheme='http', netloc='www.cwi.nl:80', path='/%7Eguido/Python.html', params='', query='', fragment='')
http 80 http://www.cwi.nl:80/%7Eguido/Python.html
"""