Selenium是一个用于Web应用程序测试的工具。Selenium测试直接运行在浏览器中,就像真正的用户在操作一样。支持的浏览器包括IE(7, 8, 9, 10, 11),Mozilla Firefox,Safari,Google Chrome,Opera,Edge等。这个工具的主要功能包括:测试与浏览器的兼容性——测试应用程序是否能够很好地工作在不同浏览器和操作系统之上。测试系统功能——创建回归测试检验软件功能和用户需求。支持自动录制动作和自动生成.Net、Java、Perl等不同语言的测试脚本。
一、安装chrome+chromeDriver
二、安装语言包
# step1: install the font configuration library
yum -y install fontconfig
# step2: create a directory for the additional fonts (-p: no error if it already exists)
mkdir -p /usr/share/fonts/myfonts
# step3 (manual): copy the files ending in .ttc and .ttf from the Windows/Fonts directory on drive C: into the directory created above
# step4: grant permissions on the new font directory
chmod -R 755 /usr/share/fonts/myfonts/
# step5: install ttmkfdir, the tool used to collect the font information found in a directory
yum -y install ttmkfdir
# step6: inspect the font directories listed in the fontconfig configuration file
vi /etc/fonts/fonts.conf
# step7: refresh the font cache so that the new fonts take effect
fc-cache
# step8: check that the Chinese fonts were added successfully
fc-list
三、使用python代码实现网站的登录
■配置:
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import time
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from helium import *
from bs4 import BeautifulSoup
import logging
import os
import smtplib
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
import pymysql
import traceback
from datetime import datetime
import pickle
class demoSpider():
    """Headless-Chrome scraper session with its mail, database and log settings.

    Creating an instance sets the batch/login identifiers and working
    directories, configures file logging, builds the Chrome options, starts
    the WebDriver session, and stores the mail and database settings as
    instance attributes.
    """

    def __init__(self):
        """Prepare directories, logging, the Chrome driver, mail and DB settings.

        Nothing is caught here: errors from ``os.makedirs``,
        ``logging.basicConfig`` or ``webdriver.Chrome`` propagate to the
        caller.
        """
        self.batch_id = 4
        self.login_batch_kbn = 1
        self.login_id = 1
        self.root_dir = "/www/python/demo/"
        self.cookie_file = self.root_dir + "cookies.pkl"
        self.logs_dir = self.root_dir + "logs/"
        self.screenshot_dir = self.root_dir + "images/screen/"
        self.cache_dir = self.root_dir + "cache/"

        # Logging is configured before the browser is started so that the log
        # file is already in place for everything that follows.
        # basicConfig opens the log file immediately, so its directory must exist.
        os.makedirs(self.logs_dir, exist_ok=True)
        logging.basicConfig(
            # Minimum level that gets written.
            level=logging.INFO,
            # One log file per calendar day.
            filename=self.logs_dir + 'py_debug_demo_' + time.strftime("%Y%m%d", time.localtime()) + '.log',
            # 'a' appends to an existing file; 'w' would overwrite it on every run.
            filemode='a',
            # Log line layout.
            format='[%(asctime)s] %(message)s',
        )

        # Chrome settings.
        # Pretend to be a mobile device.
        self.user_agent = 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1'
        self.options = webdriver.ChromeOptions()
        # Run Chrome without a visible window.
        self.options.add_argument('--headless')
        # NOTE(review): stored but not passed to webdriver.Chrome below, so the
        # driver is presumably located via PATH - confirm.
        self.driver_path = '/usr/local/bin/chromedriver'
        self.options.add_argument('user-agent={0}'.format(self.user_agent))
        # Open the browser with a mobile-sized window.
        self.options.add_argument("window-size=375,1000")
        # Turn off Chrome's sandbox.
        self.options.add_argument('--no-sandbox')
        # Suppress the "controlled by automated test software" notice.
        self.options.add_experimental_option('useAutomationExtension', False)
        # Drop the enable-automation switch as an anti-bot-detection measure.
        self.options.add_experimental_option('excludeSwitches', ['enable-automation'])
        # Per-login user data directory (profile data such as bookmarks).
        self.options.add_argument('--user-data-dir=' + self.cache_dir + str(self.login_id))
        # Disk cache location; the same per-login directory is used.
        self.options.add_argument('--disk-cache-dir=' + self.cache_dir + str(self.login_id))
        # Added to work around a bug, as suggested by Google's documentation.
        self.options.add_argument('--disable-gpu')
        # Browser language; added to stop Chrome rendering garbled characters.
        self.options.add_argument('--lang=ja')
        # Works around the "DevToolsActivePort file doesn't exist" start-up error.
        self.options.add_argument("--remote-debugging-port=9222")
        # Block browser notification pop-ups (2 = block).
        prefs = {
            'profile.default_content_setting_values': {
                'notifications': 2
            }
        }
        self.options.add_experimental_option('prefs', prefs)
        self.driver = webdriver.Chrome(options=self.options)

        # Mail settings.
        # NOTE(review): credentials are hard-coded in source; consider loading
        # them from the environment or a protected config file.
        self.sender = "xxx@qq.com"
        self.mail_password = "123456"
        self.mail_to = [
            "xxx2@qq.com"
        ]
        self.smtp = "smtp.xxx"
        self.smtp_port = "465"

        # Database settings (same hard-coded credential caveat as above).
        self.db_config = {
            'host': "localhost",
            'port': 3306,
            'user': "demo",
            'password': "jAG4PmAe7k8WADjH",
            'database': "demo",
            'charset': 'utf8',
            'cursorclass': pymysql.cursors.Cursor,
        }
■DB操作:
def get_login_data(self):
    """Fetch the login user and password for ``self.login_id``.

    Selects ``login_user`` and ``login_password`` from the table named by
    ``self.db_demo_login`` for the row whose ``id`` equals ``self.login_id``
    and whose ``status`` is 1, using the connection settings in
    ``self.db_config``.

    Returns:
        The first matching row as returned by the cursor, or ``None`` when
        no row matches or any error occurs (the error is logged, not raised).
    """
    db = None
    try:
        logging.info("get demo login data start.")
        # Open the database connection.
        db = pymysql.connect(**self.db_config)
        # A table name cannot be bound as a query parameter, so only the id
        # value is passed separately and escaped by the driver.
        select_sql = (
            "SELECT login_user,login_password FROM " + self.db_demo_login
            + " WHERE id = %s AND `status` = 1"
        )
        with db.cursor() as cursor:
            cursor.execute(select_sql, (self.login_id,))
            # Only the first row is needed.
            row = cursor.fetchone()
        if row is None:
            logging.warning("get demo login data: no row found for id=%s", self.login_id)
            return None
        logging.info("get demo login data end.")
        return row
    except Exception:
        error_message = "get demo login data error.\n"
        error_message += "traceback.format_exc():\n%s" % traceback.format_exc() + "\n"
        logging.error(error_message)
        return None
    finally:
        # Close the connection on every path, including failures.
        if db is not None:
            db.close()
■抓取网站数据:
def get_demo_item(self):
    """Load one product page, then store its images and basic details.

    Opens ``self.products_url`` formatted with ``self.product_id`` in
    ``self.driver``, waits ``self.sleep_second`` seconds, saves a screenshot
    under ``self.screenshot_dir`` and parses the page source. Image URLs
    found inside the ``slick-list`` container replace the stored images via
    ``delete_demo_item_images`` / ``insert_demo_item_images``; the product
    name and status found in the ``item-info`` container are passed to
    ``update_demo_item``.

    Errors are not raised to the caller: on an unexpected failure
    ``self.error`` is set to 1 and ``update_batch_data`` is called with
    ``self.error_status`` and the error text.
    """
    try:
        logging.info("demo get data start.")
        # product_id is passed through str() elsewhere in this function, so
        # it may be numeric; convert before concatenating.
        logging.info("product id : " + str(self.product_id))
        product_url = self.products_url.format(self.product_id)

        # Open the page.
        # self.load_and_set_cookie()
        self.driver.get(product_url)
        time.sleep(self.sleep_second)
        soup = BeautifulSoup(self.driver.page_source, "html.parser")
        all_items = []

        # Screenshot.
        self.driver.execute_script("window.scrollBy(0, 100);")
        self.driver.save_screenshot(self.screenshot_dir + "screen_" + str(self.product_id) + ".png")

        # Product images.
        try:
            div_slick = soup.select_one('div[class="slick-list"]')
            if div_slick is None:
                logging.info("demo product images is not exist.")
            elif len(div_slick) > 0:
                img_tags = div_slick.find_all('img')
                self.delete_demo_item_images()
                # Collect every image first so that all of them are stored.
                all_items_images = [
                    {
                        "product_id": self.product_id,
                        "product_image": img_tag['src'],
                    }
                    for img_tag in img_tags
                    if img_tag.get('src')
                ]
                if all_items_images:
                    # Database insert.
                    self.insert_demo_item_images(all_items_images)
        except Exception:
            error_message = "demo product images is not exist."
            logging.info(error_message)

        # Product details.
        item_info = soup.select('div[id="item-info"]')
        if item_info:
            for item in item_info:
                try:
                    # Product name.
                    product_name = item.select('div.merHeading h1')[0].text.strip()
                    self.product_name = product_name
                    # Product status.
                    product_status = item.select('span[data-testid="商品状态"]')[0].text.strip()
                    all_items.append({
                        "product_id": self.product_id,
                        "product_url": product_url,
                        "product_name": self.product_name,
                        "product_status": product_status,
                    })
                except Exception:
                    error_message = "demo product data get error : products_url=[" + product_url + "]\n"
                    error_message += "traceback.format_exc():\n%s" % traceback.format_exc() + "\n"
                    logging.error(error_message)
            # Update the stored product details.
            self.update_demo_item(all_items)
        else:
            # NOTE(review): this placeholder record is built but not passed
            # to update_demo_item in the code shown - confirm the intent.
            all_items.append({
                "product_id": self.product_id,
                "product_url": product_url,
                "product_name": "",
                "product_status": "",
            })
        logging.info("demo get data end.")
    except Exception:
        self.error = 1
        error_message = "demo get data error : products_url=[" + self.products_url.format(self.product_id) + "]\n"
        error_message += "traceback.format_exc():\n%s" % traceback.format_exc() + "\n"
        logging.error(error_message)
        self.update_batch_data(self.error_status, error_message)
四、服务器上部署遇到的一些问题
★DevToolsActivePort file doesn’t exist
■报错信息如下:
Traceback (most recent call last):
  File "/www/python/demo/demo.py", line 985, in <module>
    spider = demoSpider()
  File "/www/python/demo/demo.py", line 64, in __init__
    self.driver = webdriver.Chrome(options = self.options)
  File "/www/python/demo/8c2886dca39e7b692cb378e704072ad4_venv/lib64/python3.6/site-packages/selenium/webdriver/chrome/webdriver.py", line 81, in __init__
    desired_capabilities=desired_capabilities)
  File "/www/python/demo/8c2886dca39e7b692cb378e704072ad4_venv/lib64/python3.6/site-packages/selenium/webdriver/remote/webdriver.py", line 157, in __init__
    self.start_session(capabilities, browser_profile)
  File "/www/python/demo/8c2886dca39e7b692cb378e704072ad4_venv/lib64/python3.6/site-packages/selenium/webdriver/remote/webdriver.py", line 252, in start_session
    response = self.execute(Command.NEW_SESSION, parameters)
  File "/www/python/demo/8c2886dca39e7b692cb378e704072ad4_venv/lib64/python3.6/site-packages/selenium/webdriver/remote/webdriver.py", line 321, in execute
    self.error_handler.check_response(response)
  File "/www/python/demo/8c2886dca39e7b692cb378e704072ad4_venv/lib64/python3.6/site-packages/selenium/webdriver/remote/errorhandler.py", line 242, in check_response
    raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.WebDriverException: Message: unknown error: DevToolsActivePort file doesn't exist
■解决方案:追加以下参数
options.add_argument("--remote-debugging-port=9222")
★chrome乱码
■解决方案:追加以下参数
self.options.add_argument('--lang=ja')