Huge Lemon的博客

Python学习笔记 —— 爬取教务网明日课程并发送给微信好友

2018-05-18

简介

  • 登录教务网爬取课程表,并将明日课程通过微信消息发送给好友作为提醒
  • 主要用到itchat、requests、BeautifulSoup等库
  • 登录的用户名和密码以及爬取下来的验证码需要自己输入
  • 获取本地日期,判断星期几,加一天就是明日的星期数。

源代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
#-*-coding:utf-8-*-
import os
import re
from lxml import etree
import requests
import sys
from bs4 import BeautifulSoup
import importlib,sys
from PIL import Image,ImageEnhance
from datetime import datetime,date
import time
import datetime
import itchat


def getInfor(response, xpath):
    """Return the first node matched by ``xpath`` in a response body.

    Args:
        response: Object whose ``content`` attribute holds the raw page
            bytes (``main`` passes a ``requests`` response).
        xpath: XPath expression evaluated against the parsed page.

    Returns:
        The first item of the XPath result list.

    Raises:
        IndexError: If the expression matches nothing on the page.
    """
    # The page source is gb2312 and must be decoded before parsing. GBK is a
    # superset of GB2312, so valid gb2312 bytes decode to the same text while
    # characters outside the smaller set no longer raise UnicodeDecodeError;
    # errors='replace' keeps a single stray byte from aborting the whole run.
    content = response.content.decode('gbk', errors='replace')
    selector = etree.HTML(content)
    infor = selector.xpath(xpath)[0]
    return infor

def get_week_day(date):
    """Return the Chinese weekday name of the day AFTER ``date``.

    Args:
        date: Any object exposing ``weekday()`` with Monday == 0 and
            Sunday == 6 (``datetime.date`` / ``datetime.datetime``).

    Returns:
        One of '星期一' .. '星期日', naming tomorrow's weekday.
    """
    names = (
        '星期一',
        '星期二',
        '星期三',
        '星期四',
        '星期五',
        '星期六',
        '星期日',
    )
    # Advance by one day; Sunday (6) wraps around to Monday (0).
    next_index = (date.weekday() + 1) % len(names)
    return names[next_index]

def get_tomorrow_column(date):
    """Map a Chinese weekday name to its column number.

    Args:
        date: Weekday name, one of '星期一' .. '星期日' (despite the
            parameter name this is a string, not a date object).

    Returns:
        An int from 3 ('星期一') to 9 ('星期日').

    Raises:
        KeyError: If ``date`` is not one of the seven weekday names.
    """
    weekday_names = (
        '星期一',
        '星期二',
        '星期三',
        '星期四',
        '星期五',
        '星期六',
        '星期日',
    )
    # Monday is column 3; every following day is one column further right.
    columns = {
        label: number for number, label in enumerate(weekday_names, start=3)
    }
    return columns[date]

# Root URL of the academic-affairs site used by every request below.
# NOTE(review): the parenthesised path segment looks like a server-issued
# session token and may need to be refreshed when it expires -- confirm.
_BASE_URL = "http://202.199.155.35/(0r3mgfvgmxuxkvjz4lf1v5a1)/"

# Browser identification sent with requests so they look like a normal browser.
_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/49.0.2623.110 Safari/537.36"
)


def _quote_name(name):
    """Percent-encode ``name`` as GBK bytes for use in a query string."""
    from urllib.parse import quote

    return quote(name, encoding="gbk")


def _login(captcha_path):
    """Interactively log in to the academic-affairs site.

    Prompts for student number, password and captcha text, saving the captcha
    image to ``captcha_path`` so it can be displayed.

    Returns:
        ``(session, studentnumber, quoted_name)`` on success, or ``None`` when
        the captcha could not be saved or the login did not succeed.
    """
    studentnumber = input("学号:")
    password = input("密码:")

    # The login form must be posted back with its hidden __VIEWSTATE value,
    # so fetch the login page first and read that value from the form.
    session = requests.session()
    url = _BASE_URL + "default2.aspx"
    response = session.get(url)
    selector = etree.HTML(response.content)
    viewstate = selector.xpath('//*[@id="form1"]/input/@value')[0]

    # Download the captcha with the same session so the cookies match.
    imgresponse = session.get(_BASE_URL + "CheckCode.aspx?", stream=True)
    print(session.cookies)
    print("保存验证码到:" + captcha_path + "\n")
    try:
        with open(captcha_path, "wb") as jpg:
            jpg.write(imgresponse.content)
    except IOError:
        print("IO Error\n")
        return None

    # The captcha is typed in by hand. The context manager guarantees the
    # image file handle is released before the file is deleted later on.
    with Image.open(captcha_path) as image:
        image.show()
        code = input("验证码是:")
        # NOTE(review): the purpose of this pause is not evident from the
        # code; it is kept from the original.
        time.sleep(2)

    data = {
        "__VIEWSTATE": viewstate,
        "txtUserName": studentnumber,
        "TextBox2": password,
        "txtSecretCode": code,
        "Button1": "",
    }
    headers = {"User-Agent": _USER_AGENT}
    response = session.post(url, data=data, headers=headers)

    # The "xhxm" element is only found on the page returned by a successful
    # login, so a missing element is reported instead of crashing.
    try:
        text = getInfor(response, '//*[@id="xhxm"]/text()')
    except IndexError:
        print("登录失败,请检查学号、密码和验证码")
        return None
    print("成功进入")
    print(text.replace(" ", ""))

    found = re.findall('<span id="xhxm">(.*?)</span></em>', response.text)
    if not found:
        print("未能从页面中解析出姓名")
        return None
    # Drop the last two characters of the captured text.
    # NOTE(review): presumably a trailing "同学" suffix -- confirm.
    name = found[0][:-2]
    return session, studentnumber, _quote_name(name)


def _fetch_timetable_rows(session, studentnumber, quoted_name):
    """Fetch the timetable page and return the ``<tr>`` rows of ``Table1``.

    Returns ``None`` (after printing a message) when the table is missing.
    """
    kburl = (
        _BASE_URL + "xskbcx.aspx?xh=" + studentnumber
        + "&xm=" + quoted_name + "&gnmkdm=N121603"
    )
    # The Referer tells the server the request comes from the post-login main
    # page; the original author notes it is required.
    headers = {
        "Referer": _BASE_URL + "xs_main.aspx?xh=" + studentnumber,
        "User-Agent": _USER_AGENT,
    }
    response = session.get(kburl, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")
    table = soup.find(id='Table1')
    if table is None:
        print("未找到课表,可能登录已失效")
        return None
    return table.find_all('tr')


def _send_tomorrow_courses(trs, tomorrow, userName):
    """Print tomorrow's timetable cells and send the non-empty ones."""
    column = get_tomorrow_column(tomorrow)
    # Row 0 is skipped; rows 1..10 are scanned (fewer if the table is shorter).
    for i, row in enumerate(trs[1:11], start=1):
        cells = row.find_all('td')
        # Rows 2, 6 and 10 start with an extra time-of-day label cell
        # (e.g. "上午"/"下午"), so the weekday cell sits one position further
        # right there than in the other rows.
        if i in (2, 6, 10):
            position, limit = column, 8
        else:
            position, limit = column - 1, 7
        if position < limit and position <= len(cells):
            info = cells[position - 1].text
            print(info, end='\t')
            if info.strip():
                itchat.send(str(info), toUserName=userName)
        print()

    if column < 8:
        info = "——这些是明天的课程,注意周次,请做好课前准备"
        itchat.send(str(info), toUserName=userName)
    else:
        info = "明天没有课程哦!"
        print("\t" + info)
        itchat.send(str(info), toUserName=userName)


def _notify_friend(trs, userName):
    """Send tomorrow's date and courses to one friend.

    Returns:
        ``True`` when a course list was sent and the caller may prompt for
        another friend; ``False`` when tomorrow has no classes (holiday or
        weekend), in which case the program ends as the original did.
    """
    now = datetime.datetime.now()
    n_days = now + datetime.timedelta(days=1)
    tomorrow = get_week_day(now)
    date_info = "明天是 " + n_days.strftime('%Y-%m-%d') + " " + tomorrow
    print(date_info)
    itchat.send(str(date_info), toUserName=userName)

    # Hard-coded May Day break: 1 May through 6 May.
    if n_days.month == 5 and 1 <= n_days.day < 7:
        info = "明天没有课程哦,好好享受五一假期!"
        print("\t" + info)
        itchat.send(str(info), toUserName=userName)
        return False
    if tomorrow in ("星期六", "星期日"):
        info = "明天没有课程哦!"
        print("\t" + info)
        itchat.send(str(info), toUserName=userName)
        return False

    _send_tomorrow_courses(trs, tomorrow, userName)
    return True


def main():
    """Log in, fetch the timetable and send tomorrow's courses over WeChat.

    Student number, password, captcha text and friend nicknames are all read
    interactively. The downloaded captcha image is removed on every exit path.

    NOTE(review): the published listing lost its indentation, so the extent of
    the friend-prompt loop is reconstructed; entering "0" leaves the loop.
    """
    captcha_path = os.path.join(os.getcwd(), "code.jpg")
    try:
        login = _login(captcha_path)
        if login is None:
            return
        session, studentnumber, quoted_name = login

        trs = _fetch_timetable_rows(session, studentnumber, quoted_name)
        if trs is None:
            return

        itchat.auto_login(hotReload=True)
        while True:
            friend = input('请输入好友昵称(输入0退出):')
            if friend == "0":
                break
            users = itchat.search_friends(name=str(friend))
            if not users:
                # Unknown nickname: ask again instead of raising IndexError.
                print("未找到该好友,请重新输入")
                continue
            if not _notify_friend(trs, users[0]['UserName']):
                break
    finally:
        print("退出...")
        # Delete the temporary captcha image.
        if os.path.exists(captcha_path):
            os.remove(captcha_path)

# Start the interactive workflow only when this file is run as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


心得

  • 在登录教务网时需要保存cookie信息,否则将无法访问后续网页
  • 爬取网页发送请求的一种方法是用requests,还可以用urllib.request 和 urllib.parse
  • 爬取课程表格时,有“早上”、“上午”、“下午”、“晚上”等列比较特殊,它们和各个时间段的第一行放在一起,所以要特殊处理该行
  • 再接再厉!
使用支付宝打赏
使用微信打赏

若你觉得我的文章对你有帮助,欢迎点击上方按钮对我打赏