简介

本程序模拟登录正方教务系统，获取本人的四六级成绩，并将成绩表格的内容写入本地 CSV 文件。

由于正方教务系统的每个子页面都需要携带 cookie 才能访问，用 Chrome 单独打开子页面时，网页会直接跳回登录前的界面。用调试工具查看 POST 请求后发现，此时 cookie 消失了；但是直接从系统页面内点击打开子页面则可以正常访问。所以在登录时就要保存登录信息 cookie，并在后续请求中一并带上。
因为个人技术问题，这一段代码是我从网上找的，然后修改使其能访问我们学校的网页，而且现在还不能完成验证码自动识别功能，请各位大佬见谅！

源代码

import requests
import re
from html.parser import *
import urllib.request
import os
import csv
from bs4 import BeautifulSoup



# Module-level collectors that Scraper appends to while parsing a page.
x = []      # "src" of every <img> tag that carries an "id" attribute
state = []  # "value" of every <input name="__VIEWSTATE"> field


class Scraper(HTMLParser):
    """Pull the captcha image source and the __VIEWSTATE value out of a page.

    Matches are appended to the module-level lists ``x`` and ``state``
    instead of being stored on the parser instance.
    """

    def handle_starttag(self, tag, attrs):
        """Inspect one start tag and record the attributes of interest.

        Args:
            tag: Tag name as passed in by ``HTMLParser``.
            attrs: List of ``(name, value)`` pairs for the tag.
        """
        attrs = dict(attrs)
        if tag == 'img':  # captcha image
            # Only <img> tags with an id are of interest.  A tag that has no
            # usable src is skipped rather than raising KeyError.
            if 'id' in attrs and attrs.get('src') is not None:
                x.append(attrs['src'])
        elif tag == 'input':  # hidden view-state field
            # A __VIEWSTATE input without a value is skipped for the same
            # reason.
            if (attrs.get('name') == '__VIEWSTATE'
                    and attrs.get('value') is not None):
                state.append(attrs['value'])


# Fetch the login page once.  Its cookies are re-sent with every later
# request, and feeding its HTML to Scraper fills the ``state`` list with the
# __VIEWSTATE value that the login form posts back.
login_page = requests.get(
    url="http://202.199.155.35/(qftqpd452cwggh55jlgisyuc)/default2.aspx")
Cookie = login_page.cookies  # cookie jar of the initial response
Scraper().feed(login_page.text)

# Browser-like User-Agent used for the captcha download and the login POST.
headers = {
    'User-Agent': r'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0;  TheWorld 7)',
}

DstDir = os.getcwd()
# The captcha image is saved in the working directory.  The path is built by
# plain concatenation so it is the exact string the cleanup step at the end
# of the script checks and removes.
captcha_path = DstDir + '\\CheckCode.jpg'
captcha_url = "http://202.199.155.35/(qftqpd452cwggh55jlgisyuc)/CheckCode.aspx"
Log_in = r"http://202.199.155.35/(qftqpd452cwggh55jlgisyuc)/default2.aspx"

# Keep prompting until one login attempt succeeds.
while True:
    # Download a fresh captcha for this attempt.  Mode 'wb' truncates an
    # image left over from an earlier attempt and the ``with`` block closes
    # the file, so no explicit remove()/close() is needed.
    pic = requests.get(captcha_url, cookies=Cookie, headers=headers)
    with open(captcha_path, 'wb') as f:
        f.write(pic.content)

    username = input("输入用户名: ")
    password = input("输入密码：")

    # The captcha is read by the user: open the image with the associated
    # application (os.startfile exists on Windows only) and ask for the code.
    os.startfile(captcha_path)
    print("验证码在"+DstDir+"\\CheckCode.jpg")
    ycode = input("输入验证码: ")

    payload = {
        '__VIEWSTATE': state[0],
        'txtUserName': username,
        'TextBox2': password,
        'txtSecretCode': ycode,
        # NOTE(review): requests form-encodes values itself, so this already
        # percent-escaped literal is presumably sent double-encoded
        # ("%25D1%25A7..."). If the server needs the real value, pass the
        # GB2312 bytes b'\xd1\xa7\xc9\xfa' instead -- confirm against the
        # server before changing.
        'RadioButtonList1': '%D1%A7%C9%FA',
        'Button1': "",
        'lbLanguage': '',
        'hidPdrs': '',
        'hidsc': '',
    }

    r = requests.post(url=Log_in, data=payload,
                      headers=headers, cookies=Cookie)

    # Decide success from the response itself.  The attempt failed when the
    # page still carries the login title, or when the span holding the user's
    # name is missing.  Using re.search avoids the IndexError that indexing
    # an empty findall() result would raise on an unexpected page.
    title = re.search(r'<title>(.*?)</title>', r.text)
    greeting = re.search('<span id="xhxm">(.*?)</span></em>', r.text)
    still_on_login_page = (
        title is not None
        and title.group(1) == "欢迎使用正方教务管理系统！请登录")
    if still_on_login_page or greeting is None:
        print("登陆失败")
        continue

    print("登陆成功")
    # Drop the last two characters of the span text (presumably a suffix
    # that follows the name -- TODO confirm against the page).
    name = greeting.group(1)[:-2]
    print(name)
    break
from urllib.parse import quote

# Percent-encode the name for the ``xm`` query parameter.  ``name`` itself is
# left untouched so the CSV file name built from it keeps the full name.
# Replacing a literal backslash-x and upper-casing do nothing on a decoded
# str, and slicing off the first two characters would cut into the name.
# NOTE(review): GBK is assumed because the login form hard-codes the
# GB2312-style escape '%D1%A7%C9%FA' -- confirm the encoding the server
# expects.
name_param = quote(str(name), encoding='gbk')

lheaders = {
    'User-Agent': r'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.110 Safari/537.36',
    # Referer points at the main page of the logged-in user.
    'Referer': 'http://202.199.155.35/(qftqpd452cwggh55jlgisyuc)/xs_main.aspx?xh='+username
}

# Request the grade-exam score page with the cookies from the initial page.
html = requests.get("http://202.199.155.35/(qftqpd452cwggh55jlgisyuc)/xsdjkscx.aspx?xh="+username +
                    "&xm="+name_param+"&gnmkdm=N121606", cookies=Cookie, headers=lheaders)



# Finally, extract the score table and save it.
# Each match is one run of 10 consecutive <td> cells; re.findall returns it
# as a tuple of the 10 captured cell texts.
selectall = r'<td>(.*?)</td>'*10
result = re.findall(selectall, html.text)

# Console preview: one output line per cell position, holding that cell from
# every matched row, each stripped and left-justified to 15 characters.
forma = [
    ''.join(str(row[col]).strip().ljust(15) for row in result)
    for col in range(10)
]
for each in forma:
    print(each)

# Replace single-space cells with an empty string.  The findall() rows are
# immutable tuples, so build new lists instead of assigning into them, which
# would raise TypeError.
rows = [['' if value == " " else value for value in row] for row in result]

# ``with`` guarantees the file is closed even if writing fails.  When no
# rows matched, an empty CSV is written.
with open(DstDir+'\\'+name+'的历年英语等级考试.csv', 'w', newline='') as csvfile:
    csv.writer(csvfile).writerows(rows)

# Delete the temporary captcha image if it is still on disk.
leftover_captcha = DstDir+"\\CheckCode.jpg"
if os.path.exists(leftover_captcha):
    os.remove(leftover_captcha)

# Report where the CSV was written; input() waits for Enter before exiting.
input("爬虫完成,结果存在"+DstDir+"\\"+name+"的历年英语等级考试.csv文件下")

Tags: 网络爬虫

← 用itchat库给好友发微信消息 Java学习笔记 —— 新版Windows10配置Java环境变量 →

赏

使用支付宝打赏

使用微信打赏

若你觉得我的文章对你有帮助，欢迎点击上方按钮对我打赏

Python 3.6 模拟登录学校教务系统获取四六级成绩

简介

源代码