Disclaimer: this post exists purely as my own study notes; the functionality implements a course-experiment requirement, and it was written while I was still learning crawling techniques. Some of the code was written when my skill level wasn't up to it, so mistakes are common, and some variable names are long and ugly. My abilities are limited.
Project source code
2023: when I started the experiment, I looked online for Zhihu auto-login code. The cookie-based versions still worked, but the username/password ones no longer did. In other words, Zhihu upgraded its anti-crawling defenses sometime in 2022-2023, and logging in to Zhihu directly with an automation tool driving the browser is no longer feasible. Zhihu seems to keep strengthening its anti-crawling measures, so who knows how long this code will stay usable.
Libraries used
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from time import sleep
import json
import os
import re
import threading
from concurrent.futures import ThreadPoolExecutor
When writing this part and the next I wasn't very familiar with Selenium or with designing XPaths, so many of the XPaths look clumsy (I may optimize them later; for now I have no plans to change them).
Per the experiment requirements, manually scanning a QR code is sufficient (I plan to add the two QQ login methods later).
(The main function implements the "automatic" login via Weibo third-party login.)
if __name__ == '__main__':
    WeiBo_usr = '3247842625@qq.com'
    WeiBo_pwd = 'irontys'
    Login = Login_ZhiHu('https://zhihu.com', chrome_ports[0])
    Login.third_party_WeiBo_login(WeiBo_usr, WeiBo_pwd)
    Login.sign_cookie()
    # remove this sleep in a real project
    sleep(100000)
# Port list used later for the multithreaded speed-up; feel free to modify.
# Note: do not use ports in range(9243, 9253).
chrome_ports = ['9222','9223','9224','9225','9226']
# Controls whether the information-collection browser windows are visible.
# For invisible windows, set visible = 0.
# Invisibility is implemented by moving the opened browser windows to a
# position outside the screen; depending on your display resolution you
# may need to adjust the x value in the window_position variable.
visible = 1
if visible:
    window_position = {'x': '0', 'y': '0'}
else:
    window_position = {'x': '4000', 'y': '0'}


class Login_ZhiHu():
    def __init__(self, url, chrome_port):
        # Attach to a real Chrome instance; otherwise Zhihu's search is unusable and
        # some users' answers stay invisible. The folder containing chrome.exe must
        # be added to the PATH environment variable before this will work.
        cmd = ('chrome.exe --remote-debugging-port=' + chrome_port
               + ' --window-position=' + window_position['x'] + ',' + window_position['y']
               + ' --user-data-dir="E:/Iront/StudyItems/TC/Crouses/ContentSecurity/'
               + 'EX1_ZhiHu_Info_collention/project/chrome_user_data_' + chrome_port + '"')
        os.popen(cmd)
        options = webdriver.ChromeOptions()
        options.add_experimental_option("debuggerAddress", "127.0.0.1:" + chrome_port)  # the port set above
        # When Chrome runs in the background (not topmost on screen), some elements may
        # fail to load or be located; the option below is meant to avoid that problem
        options.add_argument("--headless")  # unsure whether this option actually helps
        self.url = url
        self.driver = webdriver.Chrome(options=options)
        self.login_cookie = None
        self.driver.set_window_position(int(window_position['x']), int(window_position['y']))
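Attaching through debuggerAddress only works once the Chrome started by os.popen has actually opened its debugging port; a fixed sleep only papers over the race. Below is a minimal readiness-check sketch using nothing beyond the standard library; wait_for_debug_port is a hypothetical helper, not part of the original project.

import socket

# Hypothetical helper (not from the project): poll the remote-debugging
# port until the Chrome launched by os.popen(cmd) is actually listening.
def wait_for_debug_port(chrome_port, timeout=15):
    for _ in range(timeout * 10):
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            s.settimeout(0.1)
            if s.connect_ex(('127.0.0.1', int(chrome_port))) == 0:
                return True  # port is open; safe to attach the webdriver
        sleep(0.1)
    return False

Called between os.popen(cmd) and webdriver.Chrome(options=options), this would avoid intermittent connection failures on slow machines.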
The third_party_WeiBo_login function of the Login_ZhiHu class
def third_party_WeiBo_login(self, usr, pwd):
    self.driver.get(self.url)
    self.driver.maximize_window()
    # Click "log in via Weibo"
    self.driver.find_element(By.XPATH, '//*[@id="root"]/div/main/div/div/div/div/div[2]/div/div[3]/span/button[3]').click()
    # Operate on the Weibo third-party login page that just opened
    all_handles = self.driver.window_handles
    # Needed later to switch back from the third-party login page to the Zhihu homepage
    ZhiHu_Handle = all_handles[0]
    # Switch to the Weibo login window handle
    WeiBo_Handle = all_handles[1]
    self.driver.switch_to.window(WeiBo_Handle)
    # Switch to the username/password login form
    self.driver.find_element(By.XPATH, '//*[@id="jump_login_url_a"]').click()
    # Wait for the page to load (same below)
    try:
        WebDriverWait(self.driver, 100).until(EC.presence_of_element_located((By.XPATH, '//*[@id="username"]')))
    except:
        print("Login failed! Check your network status")
    # Enter the Weibo account and password
    self.driver.find_element(By.XPATH, '//*[@id="username"]').send_keys(usr)
    self.driver.find_element(By.XPATH, '//*[@id="password"]').send_keys(pwd)
    sleep(1)
    # Log in
    self.driver.find_element(By.XPATH, '//*[@id="vForm"]/div[2]/div/ul/li[7]/div[1]/input').click()
    try:
        WebDriverWait(self.driver, 100).until(EC.presence_of_element_located((By.XPATH, '//*[@id="message_sms_login"]')))
    except:
        print("Login failed! Check your network status")
    # Click QR-code verification
    self.driver.find_element(By.XPATH, '//*[@id="qrCodeCheck"]').click()
    # Find the QR code's link
    img_src = self.driver.find_element(By.XPATH, '//*[@id="qrcode"]')
    src = img_src.get_attribute("src")
    try:
        WebDriverWait(self.driver, 100).until(EC.presence_of_element_located((By.XPATH, '//*[@id="outer"]/div/div[2]/div/div[2]/div[2]/p/a[1]')))
    except:
        print("Login failed! Check your network status")
    # Confirm the authorization
    self.driver.find_element(By.XPATH, '//*[@id="outer"]/div/div[2]/div/div[2]/div[2]/p/a[1]').click()
    self.driver.switch_to.window(ZhiHu_Handle)
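The wait-then-act pattern above (WebDriverWait, then find_element, with the same message in every except) repeats for nearly every element. A small helper along these lines would shrink the function considerably; this is a sketch of that refactor, not code from the project, and wait_and_click is a hypothetical name:

def wait_and_click(driver, xpath, timeout=100):
    # Hypothetical helper condensing the repeated wait/locate/click pattern
    try:
        WebDriverWait(driver, timeout).until(
            EC.element_to_be_clickable((By.XPATH, xpath)))
    except:
        print("Login failed! Check your network status")
        return None
    element = driver.find_element(By.XPATH, xpath)
    element.click()
    return element

# e.g. the QR-code verification step would become:
# wait_and_click(self.driver, '//*[@id="qrCodeCheck"]')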
Other functions of the Login_ZhiHu class: sign_cookie, cookie_login, and prepared_drive
# Save the cookies to the specified file
def sign_cookie(self):
    try:
        WebDriverWait(self.driver, 10000).until(EC.presence_of_element_located((By.XPATH, '//*[@id="Popover1-toggle"]')))
    except:
        print("Waited a long time for you to scan -_-")
    dictCookies = self.driver.get_cookies()  # get the cookies as a list
    jsonCookies = json.dumps(dictCookies)    # serialize to a string for saving
    with open('ZhiHu_cookies.txt', 'w') as f:
        f.write(jsonCookies)
    print('Cookies saved successfully!')

# Prepares for the multithreading step later; also handy for debugging,
# since it avoids re-scanning the QR code on every run
def cookie_login(self):
    self.driver.get(self.url)
    # The script also needs "# -*- coding: utf-8 -*-" at the top,
    # otherwise open() misbehaves
    if self.login_cookie is None:
        with open('ZhiHu_cookies.txt', 'r', encoding='utf-8') as f:
            self.login_cookie = listCookies = json.loads(f.read())
    else:
        listCookies = self.login_cookie
    # Add the cookies to the driver
    for cookie in listCookies:
        cookie_dict = {
            'domain': '.zhihu.com',
            'name': cookie.get('name'),
            'value': cookie.get('value'),
            "expires": '',
            'path': '/',
            'httpOnly': False,
            'HostOnly': False,
            'Secure': False,
        }
        self.driver.add_cookie(cookie_dict)
    # No sleep is necessary here
    self.driver.refresh()

# Hand the driver over to the next class, User_ZhiHu,
# which collects a specified user's information
def prepared_drive(self):
    return self.driver
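cookie_login and prepared_drive are what make the multithreaded speed-up mentioned around chrome_ports possible: each worker attaches to its own debugging port and reuses the saved cookies, so only one QR scan is ever needed. Below is a minimal sketch of that idea using the ThreadPoolExecutor imported at the top; collect_title and user_urls are hypothetical names for illustration, not part of the original project.

def collect_title(chrome_port, user_url):
    # Hypothetical worker: one Chrome instance per debugging port, all
    # logged in from the ZhiHu_cookies.txt saved by sign_cookie()
    login = Login_ZhiHu('https://www.zhihu.com', chrome_port)
    login.cookie_login()           # reuse the saved cookies, no QR scan needed
    driver = login.prepared_drive()
    driver.get(user_url)           # jump straight to the target user's page
    return driver.title

user_urls = ['https://www.zhihu.com/people/example-1',
             'https://www.zhihu.com/people/example-2']
with ThreadPoolExecutor(max_workers=len(chrome_ports)) as pool:
    for title in pool.map(collect_title, chrome_ports, user_urls):
        print(title)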
Basic profile information: username, gender, one-line introduction, residence, industry, employment history, and personal bio
if __name__ == '__main__':
    WeiBo_usr = '3247842625@qq.com'
    WeiBo_pwd = 'irontys'
    login_url = 'https://www.zhihu.com'
    Login = Login_ZhiHu(login_url, chrome_ports[0])
    Login.cookie_login()
    driver_ZhiHu = Login.prepared_drive()
    # Class used for collecting user information
    User = User_ZhiHu(driver_ZhiHu)
    # This user was just picked at random; sincere apologies if this causes any offense
    username_ZhiHu = '孟冬石榴'
    home_page_url = User.goto_user_home_page(username_ZhiHu)
    User.user_basic_information_collection('basic_information.txt')
The User_ZhiHu class, used for collecting user information
class User_ZhiHu():
    def __init__(self, driver):
        # Reuse the driver from Login_ZhiHu
        self.driver = driver
        # The attributes below support the scheduled detection of updates
        # to the user's information.
        # Element 0 holds the total number of the user's asks/answers
        self.answers_edit_statue = [''] * 10
        self.asks_edit_statue = [''] * 10
        self.valid_asks_count = 0
        self.valid_answers_count = 0
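The *_edit_statue lists above are the state for the scheduled update check mentioned in the section title. One way that check could be driven is with the threading module imported at the top; the sketch below is my assumption about the intended scheduling, not the project's actual monitor, and schedule_monitor, check_updates, and interval are hypothetical names.

def schedule_monitor(check_updates, interval=600):
    # Hypothetical scheduler (not from the project): re-run a collection
    # function every `interval` seconds using threading.Timer from the stdlib
    def tick():
        check_updates()                          # e.g. re-collect answers and diff the edit status lists
        threading.Timer(interval, tick).start()  # rearm for the next round
    threading.Timer(interval, tick).start()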
The goto_user_home_page function of the User_ZhiHu class; the xpaths in this part aren't too ugly, though I may still revise them later.
Its purpose is to navigate the driver to the home page of the "target" user (username); returning the url is designed for the later multithreaded speed-up.
def goto_user_home_page(self, username):
    try:
        WebDriverWait(self.driver, 100).until(EC.presence_of_element_located((By.XPATH, '//*[@id="Popover1-toggle"]')))
    except:
        print("Search failed! Check your network status")
    # Type the search query
    self.driver.find_element(By.XPATH, '//*[@id="Popover1-toggle"]').send_keys(username)
    # Click search
    try:
        WebDriverWait(self.driver, 100).until(EC.presence_of_element_located((By.XPATH, '//*[@id="root"]/div/div[2]/header/div[2]/div[1]/div/form/div/div/label/button/span')))
    except:
        print("Search failed! Check your network status")
    self.driver.find_element(By.XPATH, '//*[@id="root"]/div/div[2]/header/div[2]/div[1]/div/form/div/div/label/button/span').click()
    # Filter the search results to users
    try:
        WebDriverWait(self.driver, 100).until(EC.presence_of_element_located((By.XPATH, '//*[@id="root"]/div/main/div/div[1]/div/div/ul/li[2]/a')))
    except:
        print("Search failed! Check your network status")
    self.driver.find_element(By.XPATH, '//*[@id="root"]/div/main/div/div[1]/div/div/ul/li[2]/a').click()
    # Zhihu detects the operations above (perhaps it recognizes them as a crawler?
    # In any case, no matter how long I sleep, every tab except "综合" (general)
    # fails to load, while the same clicks done by hand work fine), but the
    # detection can be bypassed by refreshing the page
    self.driver.refresh()
    # Wait for the search results page to load
    try:
        WebDriverWait(self.driver, 100).until(EC.presence_of_element_located((By.XPATH, '//*[@id="SearchMain"]/div/div/div/div/div[2]/div/div/div/div[1]/div/span/div/a/img')))
    except:
        print("Search failed! Check your network status")
    # Treat the first search result as the target
    self.driver.find_element(By.XPATH, '//*[@id="SearchMain"]/div/div/div/div/div[2]/div/div/div/div[1]/div/span/div/a/img').click()
    # Switch to the handle of the target user's home page
    all_handles = self.driver.window_handles
    user_home_Handle = all_handles[1]
    # Always keep at most two windows: the one being processed and the one
    # to return to once the current window is done
    self.driver.close()
    self.driver.switch_to.window(user_home_Handle)
    # After switching windows, using this statement again can fix elements in the
    # newly opened window being invisible in headless mode.
    # It's jarring to watch, and I now doubt its effectiveness, so it's removed for now:
    # self.driver.set_window_size(1920, 1080)
    # self.driver.maximize_window()
    return self.driver.current_url
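Since a single refresh() carries the whole anti-detection burden here, a slightly more defensive variant would re-wait after each refresh instead of refreshing blindly. A sketch of that variation follows; wait_with_refresh is a hypothetical helper, not code from the project.

def wait_with_refresh(driver, xpath, retries=3, timeout=30):
    # Hypothetical variation on the refresh trick: refresh and re-wait until
    # the element actually appears, up to `retries` attempts
    for attempt in range(retries):
        try:
            WebDriverWait(driver, timeout).until(
                EC.presence_of_element_located((By.XPATH, xpath)))
            return True
        except:
            driver.refresh()  # the refresh is what bypasses the detection
    return False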
The user_basic_information_collection function of the User_ZhiHu class. The final information is stored in the dictionary user_information = {}; any personal information the page doesn't display is treated as unset by the user, and its value is set to 'Not Found'.
def user_basic_information_collection(self, output_filename):
    user_information = {}
    try:
        WebDriverWait(self.driver, 50).until(EC.presence_of_element_located((By.XPATH, '//*[@id="ProfileHeader"]/div/div[2]/div/div[2]/div[1]/h1/span[1]')))
    except:
        print("Search failed! Check your network status")
    # note: the xpath is the element itself, not .../h1/span[1]/text()
    username = self.driver.find_element(By.XPATH, '//*[@id="ProfileHeader"]/div/div[2]/div/div[2]/div[1]/h1/span[1]').text
    user_information['用户名'] = username
    #################### Get the one-line introduction ####################
    declaration = self.driver.find_element(By.XPATH, '//*[@id="ProfileHeader"]/div/div[2]/div/div[2]/div[1]/h1/span[2]').text
    if declaration == '':
        declaration = 'empty'
    user_information['一句话介绍'] = declaration
    #################### Get the user's gender ####################
    try:
        self.driver.find_element(By.CSS_SELECTOR, 'svg.Zi.Zi--Male').get_attribute("class")
        gender = 'Male'
    except:
        try:
            self.driver.find_element(By.CSS_SELECTOR, 'svg.Zi.Zi--Female').get_attribute("class")
            gender = 'Female'
        except:
            gender = 'Not Found'
    user_information['性别'] = gender
    #################### Get residence, industry, employment history, and bio ####################
    # Click "查看详细资料" (view detailed profile)
    self.driver.find_element(By.XPATH, '//*[@id="ProfileHeader"]/div/div[2]/div/div[2]/div[3]/button').click()
    try:
        WebDriverWait(self.driver, 50).until(EC.presence_of_element_located((By.XPATH, '//*[@id="ProfileHeader"]/div/div[2]/div/div[2]/div[2]/div/div')))
    except:
        print("Search failed! Check your network status")
    elements = self.driver.find_elements(By.CSS_SELECTOR, 'div.ProfileHeader-detailItem')
    # Walk through each detail item and read its label and its value's inner HTML
    for element in elements:
        label = element.find_element(By.CSS_SELECTOR, 'span.ProfileHeader-detailLabel').text
        raw_value = element.find_element(By.CSS_SELECTOR, 'div.ProfileHeader-detailValue')
        value = re.sub("<[^>]+>", "", raw_value.get_attribute("innerHTML"))
        if label in ['居住地', '所在行业', '职业经历']:
            if value == '':
                value = 'empty'
            user_information[label] = value
        if label == '个人简介':
            # Use a regex to strip all angle brackets and everything inside them
            personal_profile = re.sub(r'<[^>]*>', '', self.driver.find_element(By.CSS_SELECTOR, 'div.ztext.ProfileHeader-detailValue').get_attribute("innerHTML"))
            user_information[label] = personal_profile
    user_information_tag_list = ['用户名', '性别', '一句话介绍', '居住地', '所在行业', '职业经历', '个人简介']
    for user_information_tag in user_information_tag_list:
        if user_information_tag not in user_information:
            user_information[user_information_tag] = 'Not Found'
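user_basic_information_collection receives output_filename but, as written, never uses it; presumably the dictionary is meant to be persisted the same way the cookies are. Below is a sketch of that missing final step (it would sit at the end of the function), assuming the same JSON-on-disk convention as sign_cookie; this is my guess at the intended behavior, not the original code.

    # Hypothetical final step, mirroring how sign_cookie persists cookies:
    # dump the collected dictionary as JSON; ensure_ascii=False keeps the
    # Chinese keys readable in the output file
    with open(output_filename, 'w', encoding='utf-8') as f:
        json.dump(user_information, f, ensure_ascii=False, indent=2)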
3. Social relationship information, activity information, and monitoring
All followings and followers (if the following or follower count exceeds 10, only the first 10 are collected); for each person, collect the user nickname, profile link, number of answers, number of articles, and number of followers.
# Followings and followers.
# followers_information stores all the information of the followings or followers;
# it is a list of dicts, with "isEmpty" = 0 for a valid entry and 1 otherwise
def user_relationship_information_collection(self, follower_XPATH, followings_or_followers, output_filename):
    # Click the follow tab (note: collecting "people who follow this user" doesn't
    # need the next statement, while collecting "people this user follows" needs
    # only the next statement and not the one after it; both are kept here so the
    # function can be called through a uniform interface)
    self.driver.find_element(By.XPATH, '//*[@id="ProfileMain"]/div[1]/ul/li[9]/a').click()
    # Click the "people this user follows" or "people who follow this user" button
    self.driver.find_element(By.XPATH, follower_XPATH).click()
    # Without a refresh after the click, the anti-crawling mechanism keeps the page
    # from rendering properly; refreshing the page bypasses it
    self.driver.refresh()
    # Handle the followings and followers cases separately, and read the totals
    followings_or_followers_count = self.driver.find_elements(By.XPATH, '//strong[@class="NumberBoard-itemValue"]')
    if followings_or_followers == 'followings':
        followers_count = int(followings_or_followers_count[0].get_attribute("title"))
    else:
        followers_count = int(followings_or_followers_count[1].get_attribute("title"))
    # Cap the number of entries actually collected:
    # if there are more than 10 followings/followers, take 10; otherwise take them all
    valid_followers_count = min(followers_count, 10)
    # followers_information: a list of dicts; "isEmpty" = 0 marks a valid entry, 1 an empty one
    followers_information = [{"isEmpty": 0} for i in range(10)]
    if valid_followers_count == 0:
        followers_information = [{"isEmpty": 1} for i in range(10)]
    else:
        for i in range(valid_followers_count, 10):
            followers_information[i]['isEmpty'] = 1
    follower_information_tag_list = ['用户昵称', '链接地址', '回答数', '文章数', '关注者数']
    try:
        WebDriverWait(self.driver, 50).until(EC.presence_of_element_located((By.XPATH, '//div[@class="List-item"]')))
    except:
        print("Search failed! Check your network status")
    # Locate each user's information card
    list_items = self.driver.find_elements(By.XPATH, '//div[@class="List-item"]')
    # From this point on I actually started learning the details of xpath,
    # instead of just copying xpath values from the page source...
    index = 0
    for list_item in list_items[0:valid_followers_count]:
        # The name and the profile link live in the same <a> tag
        follower_href_and_name = list_item.find_element(By.XPATH, './/span[@class="UserLink"]/div/a')
        followers_information[index]['用户昵称'] = follower_href_and_name.text
        followers_information[index]['链接地址'] = follower_href_and_name.get_attribute("href")
        # The current follower's [{'回答': '62'}, {'文章': '1'}, {'关注者': '151'}] information
        followers_answers_articles_followers = list_item.find_elements(By.XPATH, './/div[@class="ContentItem-status"]/span')
        tag_list = ['回答', '文章', '关注者']
        for tag in tag_list:
            followers_information[index][tag + '数'] = '0'
            for status_item in followers_answers_articles_followers:
                if tag == status_item.text.split(' ')[1]:
                    followers_information[index][tag + '数'] = status_item.text.split(' ')[0]
        index += 1
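For completeness, a call to this function would look like the following, reusing the User object from the earlier main block. Both follower_XPATH values below are unverified placeholders, not the real xpaths from the live Zhihu page.

# Hypothetical usage; the two tab xpaths are assumptions, not verified values
followings_tab_xpath = '//a[contains(@href, "following")]'  # "关注了" tab (assumed)
followers_tab_xpath = '//a[contains(@href, "followers")]'   # "关注者" tab (assumed)
User.user_relationship_information_collection(followings_tab_xpath, 'followings', 'followings.txt')
User.user_relationship_information_collection(followers_tab_xpath, 'followers', 'followers.txt')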