实现效果
百度关键词查排名工具需要填写以下三项: 1. 关键词 2. 域名 3. 最大查询页数。实现效果如下:
实现原理
其实原理很简单,就是用机器代替人工逐条查找。为什么说该功能查百度关键词排名精准无比呢?因为百度在渲染搜索结果时,会为每一个非广告条目生成排名;如果直接通过curl获取百度查询结果,是拿不到百度提供的排名参数的,只有在网页渲染之后才有。
用到的类库:requests、selenium、Pillow(PIL)、PyQt5
实现代码
rank_baidu.py
import os
import re
import shutil
import string
import tempfile
import time

import requests
from PIL import Image
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
headers={
'Host': 'www.baidu.com',
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:71.0) Gecko/20100101 Firefox/71.0',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.5',
'Accept-Encoding': 'gzip, deflate, br',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1',
'Pragma': 'no-cache',
'Cache-Control': 'no-cache'
}
class SearchEngine():
    """
    Find where a domain appears in Baidu's rendered results for a keyword.

    A Firefox WebDriver session submits the keyword on www.baidu.com, then
    scans each result page for an entry whose displayed URL contains the
    domain. On a hit the entry is outlined in red and a screenshot is saved.

    Attributes:
        keyword: search phrase typed into Baidu.
        domain: substring looked for in each result's displayed URL.
        page_max: maximum number of result pages to examine.
        page: 1-based number of the result page currently being examined.
        page_rank: 1-based position of the hit within its page; 0 if no hit.
        rank: ``id`` attribute of the matching result element, which this
            tool reports as the ranking; stays 0 when nothing matched.
        screenshots: path of the saved screenshot, or None if none was saved.
        headers: the header dict given to the constructor (or the default).
            This class only stores it; it is never handed to the browser.
    """

    # Upper bound on re-scans of a page whose DOM changed while iterating.
    _STALE_RETRIES = 5
    # Inline style used to outline the matching result before the screenshot.
    _HIGHLIGHT_STYLE = 'border:5px solid red;padding:10px;margin-left:-15px'
    # Directory the screenshots are written to (created on demand).
    _SCREENSHOT_DIR = './img'

    def __init__(self, keyword='', domain='', page_max=10, headers=None, headless=True):
        """
        Start a Firefox session and reset the ranking state.

        Args:
            keyword: search phrase.
            domain: domain (substring) to look for in the results.
            page_max: maximum number of result pages to examine.
            headers: optional header dict; a Firefox-like default is used
                when omitted. Stored on the instance only.
            headless: when true, Firefox is started with ``--headless`` and
                the other command-line switches listed below.
        """
        if headers is None:
            headers = {
                'Host': 'www.baidu.com',
                'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:71.0) Gecko/20100101 Firefox/71.0',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                'Accept-Language': 'en-US,en;q=0.5',
                'Accept-Encoding': 'gzip, deflate, br',
                'Connection': 'keep-alive',
                'Upgrade-Insecure-Requests': '1',
                'Pragma': 'no-cache',
                'Cache-Control': 'no-cache'
            }
        # Previously the dict was built and then dropped; keep it reachable.
        self.headers = headers

        options = None
        if headless:
            options = webdriver.FirefoxOptions()
            for argument in (
                '--headless',
                '--dns-prefetch-disable',
                '--no-referrers',
                '--disable-gpu',
                '--disable-audio',
                '--no-sandbox',
                '--ignore-certificate-errors',
                '--allow-insecure-localhost',
            ):
                options.add_argument(argument)
        self._driver = webdriver.Firefox(options=options)

        self.keyword = keyword
        self.domain = domain
        self.page_max = page_max
        self.page = 1
        self.page_rank = 0
        self.rank = 0
        self.screenshots = None

    def _search_keyword(self):
        """
        Scan the current result page for the domain.

        The scan is repeated (after a short pause) when the page's elements
        go stale mid-iteration, at most ``_STALE_RETRIES`` times; the last
        StaleElementReferenceException is re-raised instead of recursing
        without bound.

        Returns:
            True if a matching result was found on this page, else False.
        """
        attempts_left = self._STALE_RETRIES
        while True:
            try:
                return self._scan_current_page()
            except StaleElementReferenceException:
                attempts_left -= 1
                if attempts_left <= 0:
                    raise
                time.sleep(0.5)

    def _scan_current_page(self):
        """Single pass over the rendered results; records and highlights a hit."""
        wait = WebDriverWait(self._driver, 10)
        wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'result')))
        line_list = self._driver.find_elements(
            By.XPATH,
            "//div[@id='content_left']//div[@class='result c-container ']",
        )
        for position, line in enumerate(line_list, start=1):
            try:
                shown_url = line.find_element(By.CLASS_NAME, 'c-showurl')
            except NoSuchElementException:
                # Entries without a displayed URL cannot match; skip them.
                continue
            if self.domain not in shown_url.text:
                continue
            # Only record the rank for the entry that actually matched, so a
            # failed search no longer reports the id of the last entry seen.
            self.rank = line.get_attribute('id')
            self.page_rank = position
            # Pass the element itself instead of splicing its id into the
            # script text, which broke for ids that are not bare numbers.
            self._driver.execute_script(
                "arguments[0].setAttribute('style', arguments[1]);",
                line,
                self._HIGHLIGHT_STYLE,
            )
            self._screenshots()
            return True
        return False

    def _page_down(self):
        """
        Go to the next result page.

        Returns:
            True if the "next page" link was clicked; False when the page
            limit has been reached or no such link is available.
        """
        # ``>=`` so that exactly ``page_max`` pages are examined; the old
        # ``>`` comparison let one extra page through.
        if self.page >= self.page_max:
            return False
        wait = WebDriverWait(self._driver, 10)
        try:
            wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'n')))
        except TimeoutException:
            # No pager link became clickable: treat as "no next page".
            return False
        for link in self._driver.find_elements(By.CLASS_NAME, 'n'):
            if '下一页' in link.text:
                link.click()
                self.page = self.page + 1
                # Short pause before the next scan of the page.
                time.sleep(0.5)
                return True
        return False

    def do_rank(self):
        """
        Run the search and walk the result pages until a hit or the limit.

        Results are left in ``rank``, ``page``, ``page_rank`` and
        ``screenshots``. The browser is always closed, even when a step
        raises.
        """
        try:
            self._driver.get('https://www.baidu.com')
            # Explicit wait for the search box. No implicit wait is set, so
            # per-result lookups for a missing 'c-showurl' fail immediately
            # instead of blocking for the implicit timeout.
            wait = WebDriverWait(self._driver, 10)
            search_box = wait.until(EC.presence_of_element_located((By.ID, 'kw')))
            search_box.send_keys(self.keyword)
            self._driver.find_element(By.ID, 'su').send_keys(Keys.ENTER)  # start the search

            found = self._search_keyword()
            while not found and self._page_down():
                found = self._search_keyword()
        finally:
            self._quit()

    def _screenshots(self):
        """
        Resize the window to the full document and save a screenshot.

        The file is written to ``./img/baidu_<YYYYmmddHHMMSS>.png``; the
        directory is created if missing. ``screenshots`` is updated only
        when the driver reports that the file was saved.
        """
        timestamp = time.strftime('%Y%m%d%H%M%S', time.localtime())
        width = self._driver.execute_script(
            "return Math.max(document.body.scrollWidth, document.body.offsetWidth, document.documentElement.clientWidth, document.documentElement.scrollWidth, document.documentElement.offsetWidth);")
        height = self._driver.execute_script(
            "return Math.max(document.body.scrollHeight, document.body.offsetHeight, document.documentElement.clientHeight, document.documentElement.scrollHeight, document.documentElement.offsetHeight);")
        self._driver.set_window_size(width + 100, height + 100)
        os.makedirs(self._SCREENSHOT_DIR, exist_ok=True)
        # Timestamped name so successive captures are easy to tell apart.
        out_img_name = './img/baidu_' + timestamp + ".png"
        if self._driver.save_screenshot(out_img_name):
            self.screenshots = out_img_name

    def _quit(self):
        """Close the browser and end the WebDriver session."""
        self._driver.quit()
def baidu_rank(wd,domain,pn = 1, rn = 10):
    """
    Fetch one Baidu result page over plain HTTP and check it for a domain.

    The raw HTML is also dumped to ``ret.html`` in the system temp directory
    for inspection.

    Args:
        wd: search keyword.
        domain: text to look for anywhere in the returned HTML.
        pn: 1-based page number; sent to Baidu as the offset ``(pn-1)*rn``.
        rn: number of results per page.

    Returns:
        True if ``domain`` occurs in the page HTML, otherwise False.
    """
    offset = (pn - 1) * rn
    print('https://www.baidu.com/s','wd=',wd,'pn=',offset,'rn=',rn)
    # Informational only: the real request lets requests encode the params.
    url = 'https://www.baidu.com/s?wd={}&pn={}&rn={}'.format(wd,offset,rn)
    print(url)
    with requests.get('https://www.baidu.com/s', params={'wd': wd, 'pn': offset, 'rn': rn}, timeout=7,headers=headers) as r:
        print(r.url)
        html = r.text
    # The temp directory is resolved at runtime so this also works where
    # /tmp does not exist; the ``with`` block closes the file by itself.
    dump_path = os.path.join(tempfile.gettempdir(), 'ret.html')
    with open(dump_path,'w',encoding='utf-8') as f:
        f.write(html)
    found = domain in html
    if found:
        print('当前第{}页包含({})'.format(pn,domain))
    else:
        print('没有找到')
    return found
使用pyqt5实现简单界面,我这里使用的是pyqt的designer拖拽工具,随意的放上了必要的几个控件(虽然很丑,但不影响功能)。
main.ui 布局内容如下:
<?xml version="1.0" encoding="UTF-8"?>
<!-- Qt Designer form for the SEO main window: keyword, domain and max-page line edits, a query button and a result label. The widget names (keyworldText, domainText, pageMaxText, rankBtn, resultLabel) are referenced from run.py, so keep them in sync. -->
<ui version="4.0">
<class>SEO</class>
<widget class="QMainWindow" name="SEO">
<property name="geometry">
<rect>
<x>0</x>
<y>0</y>
<width>800</width>
<height>600</height>
</rect>
</property>
<property name="windowTitle">
<string>MainWindow</string>
</property>
<widget class="QWidget" name="centralwidget">
<widget class="QLabel" name="label">
<property name="geometry">
<rect>
<x>190</x>
<y>50</y>
<width>53</width>
<height>16</height>
</rect>
</property>
<property name="text">
<string>关键词:</string>
</property>
</widget>
<widget class="QLineEdit" name="keyworldText">
<property name="geometry">
<rect>
<x>270</x>
<y>30</y>
<width>251</width>
<height>41</height>
</rect>
</property>
</widget>
<widget class="QLabel" name="label_2">
<property name="geometry">
<rect>
<x>190</x>
<y>120</y>
<width>45</width>
<height>16</height>
</rect>
</property>
<property name="text">
<string>域名:</string>
</property>
</widget>
<widget class="QLineEdit" name="domainText">
<property name="geometry">
<rect>
<x>270</x>
<y>93</y>
<width>251</width>
<height>51</height>
</rect>
</property>
</widget>
<widget class="QLineEdit" name="pageMaxText">
<property name="geometry">
<rect>
<x>650</x>
<y>60</y>
<width>113</width>
<height>31</height>
</rect>
</property>
<property name="text">
<string>5</string>
</property>
</widget>
<widget class="QLabel" name="label_3">
<property name="geometry">
<rect>
<x>550</x>
<y>70</y>
<width>101</width>
<height>16</height>
</rect>
</property>
<property name="text">
<string>最大查询页数:</string>
</property>
</widget>
<widget class="QPushButton" name="rankBtn">
<property name="geometry">
<rect>
<x>350</x>
<y>180</y>
<width>93</width>
<height>28</height>
</rect>
</property>
<property name="text">
<string>查询</string>
</property>
</widget>
<widget class="QLabel" name="resultLabel">
<property name="geometry">
<rect>
<x>250</x>
<y>280</y>
<width>281</width>
<height>81</height>
</rect>
</property>
<property name="text">
<string>结果显示在这里</string>
</property>
</widget>
</widget>
<widget class="QMenuBar" name="menubar">
<property name="geometry">
<rect>
<x>0</x>
<y>0</y>
<width>800</width>
<height>26</height>
</rect>
</property>
</widget>
<widget class="QStatusBar" name="statusbar"/>
</widget>
<resources/>
<connections/>
</ui>
有了main.ui文件,我们可以通过如下命令生成python代码
pyuic5 -o main.py main.ui
main.py代码如下:
# -*- coding: utf-8 -*-
# Form implementation generated from reading ui file 'main.ui'
#
# Created by: PyQt5 UI code generator 5.13.2
#
# WARNING! All changes made in this file will be lost!
from PyQt5 import QtCore, QtGui, QtWidgets
class Ui_SEO(object):
    """Widget set-up for the SEO main window (pyuic5 output for main.ui)."""

    def setupUi(self, SEO):
        """Create the child widgets on *SEO* and expose them as attributes."""
        SEO.setObjectName("SEO")
        SEO.resize(800, 600)

        central = QtWidgets.QWidget(SEO)
        self.centralwidget = central
        central.setObjectName("centralwidget")

        # (attribute / Qt object name, widget class, x, y, width, height),
        # kept in the original creation order.
        layout = (
            ("label", QtWidgets.QLabel, 190, 50, 53, 16),
            ("keyworldText", QtWidgets.QLineEdit, 270, 30, 251, 41),
            ("label_2", QtWidgets.QLabel, 190, 120, 45, 16),
            ("domainText", QtWidgets.QLineEdit, 270, 93, 251, 51),
            ("pageMaxText", QtWidgets.QLineEdit, 650, 60, 113, 31),
            ("label_3", QtWidgets.QLabel, 550, 70, 101, 16),
            ("rankBtn", QtWidgets.QPushButton, 350, 180, 93, 28),
            ("resultLabel", QtWidgets.QLabel, 250, 280, 281, 81),
        )
        for name, widget_class, x, y, width, height in layout:
            child = widget_class(central)
            setattr(self, name, child)
            child.setGeometry(QtCore.QRect(x, y, width, height))
            child.setObjectName(name)

        SEO.setCentralWidget(central)

        menu_bar = QtWidgets.QMenuBar(SEO)
        self.menubar = menu_bar
        menu_bar.setGeometry(QtCore.QRect(0, 0, 800, 26))
        menu_bar.setObjectName("menubar")
        SEO.setMenuBar(menu_bar)

        status_bar = QtWidgets.QStatusBar(SEO)
        self.statusbar = status_bar
        status_bar.setObjectName("statusbar")
        SEO.setStatusBar(status_bar)

        self.retranslateUi(SEO)
        QtCore.QMetaObject.connectSlotsByName(SEO)

    def retranslateUi(self, SEO):
        """Apply the window title and the visible text of every widget."""
        translate = QtCore.QCoreApplication.translate
        SEO.setWindowTitle(translate("SEO", "MainWindow"))
        captions = (
            (self.label, "关键词:"),
            (self.label_2, "域名:"),
            (self.pageMaxText, "5"),
            (self.label_3, "最大查询页数:"),
            (self.rankBtn, "查询"),
            (self.resultLabel, "结果显示在这里"),
        )
        for widget, caption in captions:
            widget.setText(translate("SEO", caption))
run.py 将界面与业务逻辑整合在一起
import sys
from PyQt5.QtWidgets import QMainWindow, QApplication
from main import Ui_SEO
from rank_baidu import SearchEngine
class MainUI(Ui_SEO, QMainWindow):
    """Main window: wires the generated form to the Baidu ranking lookup."""

    def __init__(self, parent=None):
        """Build the form and connect its signals.

        Args:
            parent: optional parent widget, forwarded to QMainWindow.
        """
        # Forward ``parent``; it used to be accepted but silently dropped.
        super(MainUI, self).__init__(parent)
        self.setupUi(self)
        self.registerEvent()

    def registerEvent(self):
        """Connect the query button to ``doRank``."""
        self.rankBtn.clicked.connect(self.doRank)

    def doRank(self):
        """Read the form, run the lookup and show the outcome in the label.

        Invalid input and lookup failures are reported in the result label
        instead of raising out of the Qt slot.
        """
        keyword = self.keyworldText.text().strip()
        domain = self.domainText.text().strip()
        # An empty domain is a substring of every URL and would "match" the
        # first result, so reject it up front together with an empty keyword.
        if not keyword or not domain:
            self.resultLabel.setText("请输入关键词和域名")
            return
        try:
            page_max = int(self.pageMaxText.text())
        except ValueError:
            self.resultLabel.setText("最大查询页数必须是整数")
            return

        try:
            se = SearchEngine(keyword=keyword,domain=domain,page_max=page_max)
            se.do_rank()
        except Exception as exc:
            # UI boundary: browser start-up or page automation failed.
            self.resultLabel.setText("查询失败:{}".format(exc))
            return

        # page_rank stays 0 unless a result actually matched the domain.
        if se.page_rank:
            ret = "关键词排名第{}位\n在百度搜索第{}页".format(se.rank,se.page)
        else:
            ret = "未找到该域名的排名"
        self.resultLabel.setText(ret)
if __name__ == "__main__":
    # Create the Qt application, show the main window, then hand control to
    # the event loop and exit with its return code.
    app = QApplication(sys.argv)
    window = MainUI()
    window.show()
    exit_code = app.exec_()
    sys.exit(exit_code)
使用pyinstaller将pyqt5程序打包成可执行程序
pyinstaller -F --clean -w run.py
到此结束,直接运行dist目录下面的run.exe文件即可
关注公众号“何三笔记”回复 "查排名工具" 即可获取源码