
Heibanke (黑板课) Crawler Challenge

Preface: a while back I worked through the Heibanke crawler challenge and wrote code for all four levels.

Heibanke Crawler Challenge: Level 1

#-*- coding:utf-8 -*-

import requests
from lxml import etree
import re


headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0"
}

# Fetch the page for the given number
def getHtml(number):
    url = 'http://www.heibanke.com/lesson/crawler_ex00/' + number
    res = requests.get(url, headers=headers, timeout=30).text
    return res

# If the page still asks for another number, the regex match succeeds.
# Once no further number is needed, indexing the (empty) match result raises,
# so the exception is actually the success signal.
def main():
    number = ''
    print('Starting level 1..')
    while True:
        try:
            html = getHtml(number)
            # Parse the HTML text with etree.HTML() to get an Element object
            tree = etree.HTML(html)
            # Calling xpath() on the Element returns a list
            h3 = tree.xpath('/html/body/div/div/div[2]/h3/text()')[0]
            print(h3)

            if '恭喜' in h3:  # '恭喜' ("congratulations") appears in the hint once the level is cleared
                url = tree.xpath('/html/body/div/div/div[2]/a/@href')[0]
                next_url = 'http://www.heibanke.com' + url
                print('URL of the next level: %s' % next_url)

            # Regex: \d matches one digit, + repeats it one or more times.
            # re.findall(pattern, string[, flags]) returns a list of all matching substrings.
            # pattern = re.compile(r'\d+')
            # number = pattern.findall(h3)[0]
            number = re.findall(r'\d+', h3)[0]
        except Exception as e:
            print(e)
            break

if __name__ == '__main__':
    main()

# URL of the next level: http://www.heibanke.com/lesson/crawler_ex01/
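A side note on the parsing: the absolute XPath used above breaks as soon as the page layout shifts even slightly. Below is a minimal sketch of the same fetch-and-parse step with a relative XPath; the crawler_ex00 URL and the "hint lives in an h3" convention come from the code above, the get_hint name is just illustrative.

# Sketch: same page fetch as getHtml()/main(), but locating the hint by tag name.
import re
import requests
from lxml import etree

def get_hint(number=''):
    url = 'http://www.heibanke.com/lesson/crawler_ex00/' + number
    html = requests.get(url, timeout=30).text
    tree = etree.HTML(html)
    return tree.xpath('//h3/text()')[0]   # '//h3' matches any h3, independent of the surrounding divs

# Usage: print the hint and the number it contains, just like the main loop above
hint = get_hint()
print(hint, re.findall(r'\d+', hint))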

Heibanke Crawler Challenge: Level 2

#-*- coding:utf-8 -*-
import requests

def main():
    url = "http://www.heibanke.com/lesson/crawler_ex01/"
    # Error message returned by the site on a wrong guess; used as the failure marker
    wrongAnswer = "您输入的密码错误, 请重新输入"
    for psd in range(31):
        data = {"username": 'admin', 'password': psd}
        response = requests.post(url, data)
        if wrongAnswer not in response.text:
            print("Congratulations! User admin cleared the level with password %d, on to the next level" % psd)
            break
        else:
            print("Password %d is wrong, trying the next one" % psd)

if __name__ == '__main__':
    main()
#http://www.heibanke.com/lesson/crawler_ex02/
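Since this level can take up to 31 POST requests, reusing one connection through requests.Session avoids a fresh handshake per attempt and makes it easy to add a timeout. A sketch under the same assumptions as the code above (username admin, password somewhere in 0..30); the brute_force name is illustrative.

# Sketch: the same brute force, with a reused session, a timeout, and basic error handling.
import requests

def brute_force(url="http://www.heibanke.com/lesson/crawler_ex01/"):
    wrong = "您输入的密码错误"          # marker string the site returns on a wrong guess
    with requests.Session() as s:       # one connection reused for all attempts
        for pwd in range(31):
            try:
                r = s.post(url, data={"username": "admin", "password": pwd}, timeout=30)
            except requests.RequestException as e:
                print("request failed:", e)
                continue
            if wrong not in r.text:
                return pwd              # first password that does not trigger the error text
    return None

if __name__ == "__main__":
    print("password:", brute_force())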

Heibanke Crawler Challenge: Level 3

#-*- coding:utf-8 -*-

import requests
from lxml import etree


se = requests.session()
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0"
}

class HBK():
    def __init__(self):
        self.login_url = "http://www.heibanke.com/accounts/login"
        self.username = "somebody"
        self.password = "201627"

    def getCsrf(self):
        # Fetch the login page and pull the CSRF token out of the hidden form field
        res = se.get(url=self.login_url, headers=headers, timeout=30).text
        tree = etree.HTML(res)
        self.csrf = tree.xpath('/html/body/div/div/div[2]/form/input[@name="csrfmiddlewaretoken"]/@value')[0]

    def login(self):
        self.getCsrf()
        data = {
            "csrfmiddlewaretoken": self.csrf,
            "username": self.username,
            "password": self.password
        }
        se.post(url=self.login_url, headers=headers, data=data, timeout=30)
        print('Logged in')

print('Starting level 3')
spider = HBK()
spider.login()

url = 'http://www.heibanke.com/lesson/crawler_ex02/'
res = se.get(url, headers=headers, timeout=30).text
tree = etree.HTML(res)
# Grab the CSRF token for the level page itself
csrf = tree.xpath('/html/body/div/div/div[2]/form/input[@name="csrfmiddlewaretoken"]/@value')[0]

# Guess the password, one number per request
def guess(num=1):
    print('guess', num)
    data = {
        "csrfmiddlewaretoken": csrf,
        "username": "somebody",
        "password": str(num)
    }
    res = se.post(url, headers=headers, data=data, timeout=30).text
    tree = etree.HTML(res)
    h3 = tree.xpath('/html/body/div/div/div[2]/h3/text()')[0]
    if u'错误' not in h3:  # '错误' ("error") disappears from the hint when the guess is right
        print('Found the correct password: %d' % num)
        return num  # stop recursing once the hint no longer contains the error text
    else:
        guess(num + 1)

guess()
print('success')
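One caveat about guess(): it recurses once per attempt, so a password much larger than Python's default recursion limit (roughly 1000 frames) would raise RecursionError. Below is an iterative sketch of the same loop; it assumes the se, url, headers, and csrf objects defined above, and the guess_iterative name is illustrative.

# Sketch: iterative password guessing, same request/response logic as guess() above.
def guess_iterative(max_tries=1000):
    for num in range(1, max_tries + 1):
        data = {
            "csrfmiddlewaretoken": csrf,
            "username": "somebody",
            "password": str(num),
        }
        res = se.post(url, headers=headers, data=data, timeout=30).text
        h3 = etree.HTML(res).xpath('/html/body/div/div/div[2]/h3/text()')[0]
        if u'错误' not in h3:          # no "error" in the hint means the guess was right
            print('Found the correct password: %d' % num)
            return num
    return None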

Heibanke Crawler Challenge: Level 4

# -*- coding: UTF-8 -*-

import re
import requests
from bs4 import BeautifulSoup
from threading import Thread


login_website = 'http://www.heibanke.com/accounts/login'
pwd_website = 'http://www.heibanke.com/lesson/crawler_ex03/pw_list/'


# Log in and return an authenticated session
def login_fun():
    s = requests.Session()
    s.get(login_website)  # visit the login page to obtain the csrftoken
    token1 = s.cookies['csrftoken']  # save the csrftoken from the cookie
    # Put the csrftoken into the csrfmiddlewaretoken field
    dataWebsite1 = {'username': 'user',
                    'password': 'password',
                    'csrfmiddlewaretoken': token1
                    }
    s.post(login_website, data=dataWebsite1)
    return s


class MyThread(Thread):
    def __init__(self, s):
        Thread.__init__(self)
        self.s = s

    def run(self):
        global count
        global pwdlist
        global exit
        ruler = re.compile(r'.*>(\d*)<.*')  # regex for extracting the password position and value
        while count < 100:
            pwdpage = self.s.get(pwd_website).content  # use the session stored on the thread
            password_pos = BeautifulSoup(pwdpage, 'html.parser').findAll('td', {'title': 'password_pos'})
            password_val = BeautifulSoup(pwdpage, 'html.parser').findAll('td', {'title': 'password_val'})
            password_pos_list = []  # list of password positions
            password_val_list = []  # list of password values
            if password_pos:
                for i in password_pos:
                    password_pos_list.append(ruler.findall(str(i))[0])
                for j in password_val:
                    password_val_list.append(ruler.findall(str(j))[0])
                print(self.name)
                print(password_pos_list)
                print(password_val_list)
                for index in range(0, len(password_pos_list)):
                    if pwdlist[int(password_pos_list[index]) - 1] == 'x':
                        count += 1
                        pwdlist[int(password_pos_list[index]) - 1] = password_val_list[index]
                print(count)
        if exit == 0:
            exit = 1
            print(''.join(pwdlist))  # all 100 positions collected: print the assembled password

if __name__ == '__main__':
    s = login_fun()
    exit = 0
    count = 0
    pwdlist = ['x' for i in range(0, 100)]
    for i in range(0, 20):  # number of threads, adjustable
        thread = MyThread(s)
        thread.start()
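The workers above update count and pwdlist from 20 threads with no synchronization; CPython's GIL makes this mostly survivable, but the read-modify-write on count can still interleave and double-count a position. A minimal sketch of guarding that update with threading.Lock follows; the record() helper and the lock are illustrative additions, not part of the original code.

# Sketch: serialize the shared update that MyThread.run performs on count/pwdlist.
import threading

lock = threading.Lock()  # hypothetical lock shared by all worker threads

def record(pos, val):
    # Store one recovered digit; meant to be called from MyThread.run instead of
    # touching count and pwdlist directly.
    global count
    with lock:                       # serialize the read-modify-write on the shared state
        if pwdlist[pos - 1] == 'x':  # only count each position the first time it is seen
            pwdlist[pos - 1] = val
            count += 1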