Python爬虫(四)—深入学习Selenium

in python爬虫 with 0 comment

前言

以下是关于Selenium的内容讲解,强烈推荐想深入了解的读者查看官方文档。

英文版:Selenium、 https://selenium-python.readthedocs.io/

Selenium介绍安装

浏览器 | 驱动下载地址
Chrome | https://sites.google.com/a/chromium.org/chromedriver/downloads
Edge | https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver/
Firefox | https://github.com/mozilla/geckodriver/releases
Safari | https://webkit.org/blog/6900/webdriver-support-in-safari-10/

Selenium 的基本使用

from selenium import webdriver

# Example 1: open a page, print its HTML source, then close the window.
browser = webdriver.Chrome()

browser.get("http://www.baidu.com")
print(browser.page_source)
browser.close()
from selenium import webdriver

# Example 2: locate a single element.
browser = webdriver.Chrome()

browser.get("http://www.taobao.com")
# The same element (id "q") is looked up three ways: by id, by CSS selector
# and by XPath.
input_first = browser.find_element_by_id("q")
input_second = browser.find_element_by_css_selector("#q")
input_third = browser.find_element_by_xpath('//*[@id="q"]')
# Print the three lookup results.
print(input_first)
print(input_second)
print(input_third)
browser.close()

CSS选择器、xpath选择器 可以直接采用chrome浏览器的右击复制,如下图:

chrome快捷复制节点.png

from selenium import webdriver

from selenium.webdriver.common.by import By

browser = webdriver.Chrome()

browser.get("http://www.taobao.com")

# Locate by element ID.
# input_first_By_ID = browser.find_element_by_id('q')
# The lookups below follow the same pattern; each one can be written either way.
input_first_By_ID = browser.find_element(By.ID, "q")

# Locate by XPath expression.
input_first_By_XPATH = browser.find_element(By.XPATH, "//*[@id=\"q\"]")

# Locate by the element's name attribute.
input_first_By_NAME = browser.find_element(By.NAME, "q")

# Locate a link by its text.
input_first_LINK_TEXT = browser.find_element(By.LINK_TEXT, "领淘金币抵钱")

# Locate by CSS selector.
input_first_CSS_SELECTOR = browser.find_element(By.CSS_SELECTOR, "#q")

print(input_first_CSS_SELECTOR)
browser.close()
# The string below records sample output of the print call above.
"""
<selenium.webdriver.remote.webelement.WebElement (session="4ee277c466be7248d4c7e078cbf927db", element="0.09114132600392044-1")>
"""

疑问查看:WebDriver API https://selenium-python.readthedocs.io/api.html

from selenium import webdriver
from selenium.webdriver import ActionChains

browser = webdriver.Chrome()

url = "http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable"
browser.get(url)
# Switch into the 'iframeResult' frame before looking up the two elements.
browser.switch_to.frame('iframeResult')
source = browser.find_element_by_css_selector('#draggable')
target = browser.find_element_by_css_selector('#droppable')
# Build an action chain that drags `source` onto `target`, then run it
# with perform().
actions = ActionChains(browser)
actions.drag_and_drop(source, target)
actions.perform()
from selenium import webdriver
import time

# Demonstrates typing into an input, clearing it, typing again and clicking
# a button.
browser = webdriver.Chrome()
browser.get("http://www.taobao.com")
input_str = browser.find_element_by_id('q')
input_str.send_keys("ipad")
# Pause so the first keyword is visible before it is cleared.
time.sleep(1)
input_str.clear()
# Fixed misspelled search term ("MakBook" -> "MacBook").
input_str.send_keys("MacBook pro")
button = browser.find_element_by_class_name('btn-search')
button.click()
# Pause so the result of the click can be observed before the window closes.
time.sleep(3)
browser.close()

chromeOptions

chromeOptions 是一个配置 chrome 启动时属性的类。通过这个类,我们可以为chrome配置如下参数(这个部分可以通过selenium源码看到)

# Create a ChromeOptions object that collects the startup settings.
options = webdriver.ChromeOptions()
# Do not load images, which speeds up page access.
options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
# Launch the browser with the options object built above (the original
# passed an undefined name `chrome_options`).
browser = webdriver.Chrome(chrome_options=options)

常用的参数如下:

  1. 设置为开发者模式 ('excludeSwitches', ['enable-automation'])
  2. 不加载图片,加快访问速度 ("prefs", {"profile.managed_default_content_settings.images": 2})
  3. 模拟手机打开网页 ('mobileEmulation', {'deviceName': 'Apple iPhone 4'})
  4. 关闭保存密码提示
options = webdriver.ChromeOptions() 
prefs = {} 
# Setting these two preferences keeps the save-password prompt from popping up.
prefs['credentials_enable_service'] = False 
prefs['profile.password_manager_enabled'] = False 
options.add_experimental_option('prefs', prefs) 
browser = webdriver.Chrome(chrome_options=options)
启动参数 | 作用
--user-agent="" | 设置请求头的User-Agent
--window-size=1366,768 | 设置浏览器分辨率(窗口大小)
--headless | 无界面运行(无窗口)
--start-maximized | 最大化运行(全屏窗口)
--incognito | 隐身模式(无痕模式)
--disable-javascript | 禁用javascript
--disable-infobars | 禁用浏览器正在被自动化程序控制的提示
class Options(object):
    # Excerpt quoted by the article from Selenium's source; only the
    # constructor is shown here.
    # NOTE(review): KEY is presumably the capabilities key these options are
    # sent under -- confirm against the Selenium source.
    KEY = "goog:chromeOptions"

    def __init__(self):
        # Every setting starts out empty; the names suggest they hold the
        # binary path, command-line arguments, extensions, experimental
        # options and debugger address (inferred from the names only).
        self._binary_location = ''
        self._arguments = []
        self._extension_files = []
        self._extensions = []
        self._experimental_options = {}
        self._debugger_address = None
        # Work on a copy of the CHROME capabilities rather than the original.
        self._caps = DesiredCapabilities.CHROME.copy()

执行JavaScript

from selenium import webdriver

# Example 1: run JavaScript in the page.
browser = webdriver.Chrome()
browser.get("http://www.zhihu.com/explore")
# Scroll to the bottom of the document, then show an alert.
browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
browser.execute_script('alert("To Bottom")')
from selenium import webdriver

# Example 2: read an element's attribute and text.
browser = webdriver.Chrome()
url = 'https://www.zhihu.com/explore'
browser.get(url)
logo = browser.find_element_by_id('zh-top-link-logo')
print(logo)
print(logo.get_attribute('class'))  # sample output: zu-top-link-logo
print(logo.text)  # sample output: 知乎
from selenium import webdriver

# Print an element's id, location, tag name and size.
browser = webdriver.Chrome()
url = 'https://www.zhihu.com/explore'
browser.get(url)
# Named `question_button` rather than `input` so the builtin input() is not
# shadowed; the sample output below shows the element is a <button>.
question_button = browser.find_element_by_class_name('zu-top-add-question')
print(question_button.id)
print(question_button.location)
print(question_button.tag_name)
print(question_button.size)
# The string below records sample output of the four print calls above.
"""
0.8051799500933536-1
{'x': 758, 'y': 7}
button
{'height': 32, 'width': 66}
"""

Frame

涉及到切入到frame中以及切出来的问题,常用的是switch_to.frame()和switch_to.parent_frame(),如以下代码:

import time
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException

browser = webdriver.Chrome()
url = 'http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable'
browser.get(url)

# Switch into the frame named 'iframeResult'.
browser.switch_to.frame('iframeResult')

source = browser.find_element_by_css_selector('#draggable')
print(source)
# Look up the 'logo' class while still inside the frame; if the lookup
# raises NoSuchElementException, report 'NO LOGO'.
try:
    logo = browser.find_element_by_class_name('logo')
except NoSuchElementException:
    print('NO LOGO')

# Switch back out to the parent frame and repeat the lookup there.
browser.switch_to.parent_frame()

logo = browser.find_element_by_class_name('logo')
print(logo)
print(logo.text)

等待 Waits

以下内容基本摘自:https://selenium-python.readthedocs.io/waits.html

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Firefox()
driver.get("http://somedomain/url_that_delays_loading")
try:
    # Explicit wait: wait up to 10 seconds for the element with id
    # "myDynamicElement" to be present.
    element = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "myDynamicElement"))
    )
finally:
    # Quit the driver whether or not the wait succeeded.
    driver.quit()

上述代码在抛出TimeoutException之前最多等待10秒,如果在10秒内找到了元素则立即返回该元素。默认情况下,WebDriverWait每500毫秒调用一次ExpectedCondition,直到它成功返回。所谓成功返回:对于布尔类型的ExpectedCondition是返回True,对于其他所有ExpectedCondition类型则是返回非null(非None)的值。

预期条件 | 目的
title_is | 标题是某内容
title_contains | 标题包含某内容
presence_of_element_located | 元素加载出,传入定位元组,如(By.ID,'p')
visibility_of_element_located | 元素可见,传入定位元组
visibility_of | 可见,传入元素对象
presence_of_all_elements_located | 所有元素加载出
text_to_be_present_in_element | 某个元素文本包含某文字
text_to_be_present_in_element_value | 某个元素值包含某文字
frame_to_be_available_and_switch_to_it | frame加载并切换
invisibility_of_element_located | 元素不可见
element_to_be_clickable | 元素可点击
staleness_of | 判断一个元素是否仍在DOM,可判断页面是否已经刷新
element_to_be_selected | 元素可选择,传元素对象
element_located_to_be_selected | 元素可选择,传入定位元组
element_selection_state_to_be | 传入元素对象以及状态,相等返回True,否则返回False
element_located_selection_state_to_be | 传入定位元组以及状态,相等返回True,否则返回False
alert_is_present | 是否出现Alert
    • 自定义等待条件
      如果以前的便捷方法都不符合您的要求,您还可以创建自定义等待条件。可以使用带有__call__方法的类创建自定义等待条件,该方法在条件不匹配时返回False。示例 : 略...
  • from selenium import webdriver
    
    # Implicit wait: set a 10-second implicit wait on the driver before
    # loading the page and looking up the element.
    browser = webdriver.Chrome()
    browser.implicitly_wait(10)
    browser.get('https://www.zhihu.com/explore')
    # Named `question_button` rather than `input` so the builtin input() is
    # not shadowed.
    question_button = browser.find_element_by_class_name('zu-top-add-question')
    print(question_button)

    其他操作

    import time
    from selenium import webdriver
    
    # Example 1: browser history navigation.
    browser = webdriver.Chrome()
    browser.get('https://www.baidu.com/')
    browser.get('https://www.taobao.com/')
    browser.get('https://www.python.org/')
    # Go one step back in history, pause, then go forward again.
    browser.back()
    time.sleep(1)
    browser.forward()
    browser.close()
    from selenium import webdriver
    
    # Example 2: cookies.
    browser = webdriver.Chrome()
    browser.get('https://www.zhihu.com/explore')
    # Print the cookies before adding one, after adding one, and after
    # deleting them all.
    print(browser.get_cookies())
    
    browser.add_cookie({'name': 'name', 'domain': 'www.zhihu.com', 'value': 'luozheng'})
    
    print(browser.get_cookies())
    browser.delete_all_cookies()
    print(browser.get_cookies())
    import time
    from selenium import webdriver
    
    # Open a second window/tab and switch between the two.
    browser = webdriver.Chrome()
    browser.get('https://www.baidu.com')
    # window.open() creates a new window whose handle is appended to
    # window_handles.
    browser.execute_script('window.open()')
    print(browser.window_handles)
    # Use switch_to.window(); the older switch_to_window() is deprecated and
    # removed in Selenium 4, and this matches the switch_to.frame() usage
    # elsewhere in the article.
    browser.switch_to.window(browser.window_handles[1])
    browser.get('https://www.taobao.com')
    time.sleep(1)
    browser.switch_to.window(browser.window_handles[0])
    browser.get('https://python.org')
    from selenium import webdriver
    from selenium.common.exceptions import TimeoutException, NoSuchElementException
    
    browser = webdriver.Chrome()
    # Report a page-load timeout instead of letting it propagate.
    try:
        browser.get('https://www.baidu.com')
    except TimeoutException:
        print('Time Out')
    # Report a failed element lookup; the browser window is closed either way.
    try:
        browser.find_element_by_id('hello')
    except NoSuchElementException:
        print('No Element')
    finally:
        browser.close()

    selenium 模拟手机

    有些操作,需要模拟手机才能继续进行,例如微信的公众号、服务号。直接上代码,模拟手机型号等属性。

    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    
    # Device to emulate, selected by its device name.
    mobile_emulation = {"deviceName": "Pixel 2 XL",
                        }
    options = Options()
    # Pass the emulation settings to Chrome as the "mobileEmulation"
    # experimental option.
    options.add_experimental_option("mobileEmulation", mobile_emulation)
    driver = webdriver.Chrome(chrome_options=options)
    # Placeholder for the rest of the script.
    ...

    PhantomJS

    from selenium import webdriver
    
    # Same flow as the basic Chrome example, but driven through PhantomJS.
    # Per the article, Selenium emits a deprecation warning for PhantomJS and
    # recommends headless Chrome or Firefox instead.
    browser = webdriver.PhantomJS()
    
    browser.get("http://www.baidu.com")
    print(browser.page_source)
    browser.close()

    由于selenium对phantomjs不再支持,因此会警告:UserWarning: Selenium support for PhantomJS has been deprecated, please use headless versions of Chrome or Firefox instead

    Responses