Selenium是一个用于Web应用程序测试的工具。Selenium测试直接运行在浏览器中,就像真正的用户在操作一样。支持的浏览器包括IE(7, 8, 9, 10, 11),Mozilla Firefox,Safari,Google Chrome,Opera,Edge等。这个工具的主要功能包括:测试与浏览器的兼容性——测试应用程序看是否能够很好地工作在不同浏览器和操作系统之上。测试系统功能——创建回归测试检验软件功能和用户需求。支持自动录制动作和自动生成.Net、Java、Perl等不同语言的测试脚本。

一、安装chrome+chromeDriver

二、安装语言包

# step1:需要安装字体库
yum -y install fontconfig

# step2:创建文件夹
mkdir /usr/share/fonts/myfonts

# step3:将Windows系统C盘下Windows/Fonts目录中后缀名为ttc和ttf的文件复制到Linux中新建立的文件夹中

# step4:赋权
chmod -R 755 /usr/share/fonts/myfonts/

# step5:安装ttmkfdir工具,用于搜索目录中所有的字体信息
yum -y install ttmkfdir

# step6:查看字体配置文件中的字体路径
vi /etc/fonts/fonts.conf

# step7:将新字体的缓存进行刷新,目的是为了将刚刚的配置生效
fc-cache

# step8:查看中文字体是否成功添加进去
fc-list

三、使用python代码实现网站的登录

■配置:

#!/usr/bin/python3
# -*- coding: utf-8 -*-
import time
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from helium import *
from bs4 import BeautifulSoup
import logging
import os
import smtplib
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
import pymysql
import traceback
from datetime import datetime
import pickle

class demoSpider():
    """Scraper built on a headless Chrome driver.

    The constructor stores the working directories, builds the Chrome
    options, starts the browser, records the mail and database settings and
    configures file logging.
    """

    def __init__(self):
        """Set up paths, start the Chrome driver and configure logging.

        Side effects: creates the log and screenshot directories when they
        are missing, launches a Chrome session and calls
        ``logging.basicConfig`` for the root logger.
        """
        self.batch_id = 4
        self.login_batch_kbn = 1
        self.login_id = 1

        self.root_dir = "/www/python/demo/"
        self.cookie_file = self.root_dir + "cookies.pkl"
        self.logs_dir = self.root_dir + "logs/"
        self.screenshot_dir = self.root_dir + "images/screen/"
        self.cache_dir = self.root_dir + "cache/"

        # logging.basicConfig() raises FileNotFoundError when the log
        # directory is missing, so create the output directories up front.
        for directory in (self.logs_dir, self.screenshot_dir):
            os.makedirs(directory, exist_ok=True)

        # Chrome settings
        # Emulate a mobile phone
        self.user_agent = 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1'
        self.options = webdriver.ChromeOptions()
        # Run Chrome without a visible window
        self.options.add_argument('--headless')
        # NOTE(review): driver_path is stored but not passed to
        # webdriver.Chrome() below - confirm chromedriver is found via PATH.
        self.driver_path = '/usr/local/bin/chromedriver'
        self.options.add_argument('user-agent={0}'.format(self.user_agent))
        # Open the browser with a mobile-sized window
        self.options.add_argument("window-size=375,1000")
        # Disable the Chrome sandbox (original note: run with the highest privileges)
        self.options.add_argument('--no-sandbox')
        # Suppress the "controlled by automated test software" notice
        self.options.add_experimental_option('useAutomationExtension', False)
        # Anti-bot-detection measure
        self.options.add_experimental_option('excludeSwitches', ['enable-automation'])
        # User Data directory (one per login id)
        self.options.add_argument('--user-data-dir=' + self.cache_dir + str(self.login_id))
        # Disk cache directory (same location as the user data directory)
        self.options.add_argument('--disk-cache-dir=' + self.cache_dir + str(self.login_id))
        # Google's documentation mentions adding this flag to work around a bug
        self.options.add_argument('--disable-gpu')
        # Browser language (original note: fonts)
        self.options.add_argument('--lang=ja')
        # Works around the "DevToolsActivePort file doesn't exist" error
        self.options.add_argument("--remote-debugging-port=9222")
        # Block browser notification pop-ups
        prefs = {
            'profile.default_content_setting_values': {
                'notifications': 2
            }
        }
        self.options.add_experimental_option('prefs', prefs)

        self.driver = webdriver.Chrome(options = self.options)

        # Mail settings
        # NOTE(review): credentials are hard-coded here and in db_config;
        # consider loading them from the environment or a config file.
        self.sender = "xxx@qq.com"
        self.mail_password = "123456"
        self.mail_to = [
            "xxx2@qq.com"
        ]
        self.smtp = "smtp.xxx"
        self.smtp_port = "465"
        # Database settings
        self.db_config = {
          'host' : "localhost",
          'port' : 3306,
          'user' : "demo",
          'password' : "jAG4PmAe7k8WADjH",
          'database' : "demo",
          'charset' : 'utf8',
          'cursorclass':pymysql.cursors.Cursor,
        }
        # Logging: one file per day, opened in append mode
        logging.basicConfig(level = logging.INFO,  # minimum level written to the log
            filename = self.logs_dir + 'py_debug_demo_' + time.strftime("%Y%m%d", time.localtime())  + '.log',
            filemode = 'a',  # 'a' appends (the default); 'w' would overwrite the file on every run
            format = '[%(asctime)s] %(message)s'  # log line format
        )

■DB操作:

def get_login_data(self):
    """Fetch the login credentials for ``self.login_id``.

    Selects ``login_user`` and ``login_password`` from the table named by
    ``self.db_demo_login`` for the row whose ``id`` equals ``self.login_id``
    and whose ``status`` is 1, using the connection settings in
    ``self.db_config``.

    Returns:
        The first matching row as returned by the cursor (columns in the
        order ``login_user``, ``login_password``), or ``None`` when no row
        matches or when any error occurs. Errors are logged, not raised.
    """
    db = None
    try:
        logging.info("get demo login data start.")
        # Open the database connection
        db = pymysql.connect(**self.db_config)
        cursor = db.cursor()

        # The table name cannot be bound as a query parameter, so it is still
        # concatenated; the id value is bound by the driver.
        select_sql = "SELECT login_user,login_password FROM " + self.db_demo_login + " WHERE id = %s AND `status` = 1"
        cursor.execute(select_sql, (self.login_id,))

        # Only the first row is needed
        data = cursor.fetchone()
        if data is None:
            logging.info("get demo login data : no row found.")

        logging.info("get demo login data end.")

        return data
    except Exception:
        error_message = "get demo login data error.\n"
        error_message += "traceback.format_exc():\n%s" % traceback.format_exc() + "\n"
        logging.error(error_message)
        return None
    finally:
        # Close the connection on every path, including failures
        if db is not None:
            db.close()

■抓取网站数据:

def get_demo_item(self):
    """Scrape the product page for ``self.product_id`` and store the results.

    Steps:
      1. Load ``self.products_url`` formatted with the product id and wait
         ``self.sleep_second`` seconds.
      2. Scroll slightly and save a screenshot under ``self.screenshot_dir``.
      3. Replace the stored product images with the ``src`` of every ``img``
         inside ``div.slick-list``.
      4. Read the product name and status from ``div#item-info`` and pass the
         collected records to ``self.update_demo_item``.

    Errors are logged rather than raised. An unexpected failure sets
    ``self.error`` to 1 and reports the message through
    ``self.update_batch_data``.
    """
    try:
        logging.info("demo get data start.")
        # str() keeps this working for non-string ids, as the screenshot
        # file name below already assumes.
        logging.info("product id : " + str(self.product_id))
        # Load the page (cookie restore is currently disabled)
        # self.load_and_set_cookie()
        self.driver.get(self.products_url.format(self.product_id))
        time.sleep(self.sleep_second)

        soup = BeautifulSoup(self.driver.page_source, "html.parser")
        all_items = []

        # Screenshot
        self.driver.execute_script("window.scrollBy(0, 100);")
        self.driver.save_screenshot(self.screenshot_dir + "screen_" + str(self.product_id) + ".png")

        # Product images
        try:
            div_slick = soup.select_one('div[class="slick-list"]')
            if div_slick is None:
                # No image container on the page
                logging.info("demo product images is not exist.")
            elif len(div_slick) > 0:
                img_tags = div_slick.find_all('img')
                self.delete_demo_item_images()
                for img_tag in img_tags:
                    # Each image is inserted as its own single-record batch
                    self.insert_demo_item_images([{
                        "product_id": self.product_id,
                        "product_image": img_tag['src'],
                    }])
        except Exception:
            # Keep the original message but include the traceback so that
            # real failures (missing src, DB errors) are not hidden.
            error_message = "demo product images is not exist.\n"
            error_message += "traceback.format_exc():\n%s" % traceback.format_exc() + "\n"
            logging.info(error_message)

        # Product information
        item_infos = soup.select('div[id="item-info"]')
        if item_infos:
            for item in item_infos:
                try:
                    # Product name
                    product_name = item.select('div.merHeading h1')[0].text.strip()
                    self.product_name = product_name

                    # Product status
                    product_status = item.select('span[data-testid="商品状态"]')[0].text.strip()

                    all_items.append({
                        "product_id": self.product_id,
                        "product_url": self.products_url.format(self.product_id),
                        "product_name": self.product_name,
                        "product_status": product_status,
                    })

                except Exception:
                    error_message = "demo product data get error : products_url=[" + self.products_url.format(self.product_id) + "]\n"
                    error_message += "traceback.format_exc():\n%s" % traceback.format_exc() + "\n"
                    logging.info(error_message)

                # Update the stored product information (runs once per
                # matched element, even when extraction failed)
                self.update_demo_item(all_items)

        else:
            # NOTE(review): this placeholder record is built but never passed
            # to update_demo_item - confirm whether that is intended.
            all_items.append({
                "product_id": self.product_id,
                "product_url": self.products_url.format(self.product_id),
                "product_name": "",
                "product_status": "",
            })

        logging.info("demo get data end.")
    except Exception:
        self.error = 1
        error_message = "demo get data error : products_url=[" + self.products_url.format(self.product_id) + "]\n"
        error_message += "traceback.format_exc():\n%s" % traceback.format_exc() + "\n"
        logging.info(error_message)
        self.update_batch_data(self.error_status,error_message)

四、服务器上部署遇到的一些问题

★DevToolsActivePort file doesn’t exist
■报错信息如下:
Traceback (most recent call last):
File "/www/python/demo/demo.py", line 985, in <module>
spider = demoSpider()
File "/www/python/demo/demo.py", line 64, in __init__
self.driver = webdriver.Chrome(options = self.options)
File "/www/python/demo/8c2886dca39e7b692cb378e704072ad4_venv/lib64/python3.6/site-packages/selenium/webdriver/chrome/webdriver.py", line 81, in __init__
desired_capabilities=desired_capabilities)
File "/www/python/demo/8c2886dca39e7b692cb378e704072ad4_venv/lib64/python3.6/site-packages/selenium/webdriver/remote/webdriver.py", line 157, in __init__
self.start_session(capabilities, browser_profile)
File "/www/python/demo/8c2886dca39e7b692cb378e704072ad4_venv/lib64/python3.6/site-packages/selenium/webdriver/remote/webdriver.py", line 252, in start_session
response = self.execute(Command.NEW_SESSION, parameters)
File "/www/python/demo/8c2886dca39e7b692cb378e704072ad4_venv/lib64/python3.6/site-packages/selenium/webdriver/remote/webdriver.py", line 321, in execute
self.error_handler.check_response(response)
File "/www/python/demo/8c2886dca39e7b692cb378e704072ad4_venv/lib64/python3.6/site-packages/selenium/webdriver/remote/errorhandler.py", line 242, in check_response
raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.WebDriverException: Message: unknown error: DevToolsActivePort file doesn't exist

■解决方案:追加以下参数

self.options.add_argument("--remote-debugging-port=9222")
★chrome乱码

■解决方案:追加以下参数

self.options.add_argument('--lang=ja')
Avatar photo

By admin

发表回复

您的电子邮箱地址不会被公开。 必填项已用 * 标注