1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127
| import requests import re from html.parser import * import urllib.request import os import csv from bs4 import BeautifulSoup
# Module-level sinks that Scraper appends to while a page is being parsed.
x = []      # `src` of every <img> tag that also carries an `id` attribute
state = []  # `value` of every <input> whose name is `__VIEWSTATE`


class Scraper(HTMLParser):
    """HTML parser that harvests two kinds of attribute values.

    Results are not stored on the instance; they are appended to the
    module-level lists ``x`` (image sources) and ``state`` (``__VIEWSTATE``
    values) in document order.
    """

    def handle_starttag(self, tag, attrs):
        """Record the interesting attribute of an ``<img>`` or ``<input>`` tag.

        Args:
            tag: Tag name as reported by ``HTMLParser``.
            attrs: List of ``(name, value)`` pairs for the tag.
        """
        attributes = dict(attrs)
        if tag == 'img':
            # Only images that carry an id are of interest; an image without
            # a src is skipped instead of raising KeyError.
            if 'id' in attributes and 'src' in attributes:
                x.append(attributes['src'])
        elif tag == 'input':
            # A __VIEWSTATE input without a value attribute is skipped
            # instead of raising KeyError.
            if attributes.get('name') == '__VIEWSTATE' and 'value' in attributes:
                state.append(attributes['value'])
# Request the login page once; the cookies it returns are sent with the
# later requests in this script.
webpage = requests.get(
    url="http://202.199.155.35/(qftqpd452cwggh55jlgisyuc)/default2.aspx"
)
Cookie = webpage.cookies

# Feeding the markup to Scraper appends to the module-level `x` and `state`
# lists; the parser object itself is not needed afterwards.
login_html = webpage.text
Scraper().feed(login_html)

# User-Agent header sent with the captcha and login requests.
headers = {}
headers['User-Agent'] = r'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; TheWorld 7)'
DstDir = os.getcwd()
# The captcha image is written into the current working directory.
captcha_path = DstDir + '\\CheckCode.jpg'

# Keep prompting until a login attempt is accepted.
while True:
    url = "http://202.199.155.35/(qftqpd452cwggh55jlgisyuc)/CheckCode.aspx"
    pic = requests.get(url, cookies=Cookie, headers=headers)
    # Mode 'wb' truncates an existing file, so a captcha left over from a
    # previous attempt is simply overwritten; the `with` block closes it.
    with open(captcha_path, 'wb') as f:
        f.write(pic.content)

    username = input("输入用户名: ")
    password = input("输入密码:")

    # Open the image with the system's default viewer (Windows-only call)
    # so the user can read the code and type it in.
    os.startfile(captcha_path)
    print("验证码在" + captcha_path)
    ycode = input("输入验证码: ")

    payload = {
        '__VIEWSTATE': state[0],
        'txtUserName': username,
        'TextBox2': password,
        'txtSecretCode': ycode,
        # NOTE(review): requests form-encodes the values of a `data` dict,
        # so the '%' characters of this already percent-encoded string are
        # escaped a second time on the wire. Confirm the server accepts
        # that; the raw bytes may be what was intended.
        'RadioButtonList1': '%D1%A7%C9%FA',
        'Button1': "",
        'lbLanguage': '',
        'hidPdrs': '',
        'hidsc': '',
    }
    Log_in = r"http://202.199.155.35/(qftqpd452cwggh55jlgisyuc)/default2.aspx"

    r = requests.post(url=Log_in, data=payload, headers=headers, cookies=Cookie)

    # Being served the login page again (recognised by its title) means the
    # attempt was rejected. A response without any <title> cannot be
    # confirmed as a success either, so it is retried instead of crashing
    # with IndexError.
    titles = re.findall(r'<title>(.*?)</title>', r.text)
    if not titles or titles[0] == "欢迎使用正方教务管理系统!请登录":
        print("登陆失败")
        continue

    print("登陆成功")
    names = re.findall('<span id="xhxm">(.*?)</span></em>', r.text)
    if not names:
        # Logged in, but the greeting span is missing: stop with a clear
        # message rather than an IndexError on names[0].
        raise SystemExit("登陆成功,但未能从页面中解析出姓名")
    # Drop the last two characters of the captured span text.
    name = names[0][:-2]
    print(name)
    break

# NOTE(review): on a str these steps replace any literal backslash-x with
# '%', upper-case the text and drop its first two characters. They look like
# leftovers from percent-encoding the repr of an encoded bytes value;
# confirm the intent before relying on `name`.
name = str(name).replace(r'\x', '%')
name = name.upper()
name = name[2:]
# Headers for the follow-up query: a different User-Agent plus a Referer
# that carries the logged-in user name.
lheaders = {}
lheaders['User-Agent'] = r'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.110 Safari/537.36'
lheaders['Referer'] = 'http://202.199.155.35/(qftqpd452cwggh55jlgisyuc)/xs_main.aspx?xh=' + username

# The query string is assembled by plain concatenation from the user name
# and the processed display name.
results_url = (
    "http://202.199.155.35/(qftqpd452cwggh55jlgisyuc)/xsdjkscx.aspx?xh=" + username
    + "&xm=" + name
    + "&gnmkdm=N121606"
)
html = requests.get(results_url, cookies=Cookie, headers=lheaders)
# Every table row is matched as ten consecutive <td> cells; re.findall
# returns one 10-tuple of cell texts per matched row.
column_count = 10
selectall = r'<td>(.*?)</td>' * column_count
result = re.findall(selectall, html.text)

# Console preview: one output line per column, each cell stripped and
# left-justified to a width of 15. An empty table prints blank lines
# instead of raising IndexError.
forma = [
    ''.join("% -15s" % str(row[index]).strip() for row in result)
    for index in range(column_count)
]
for each in forma:
    print(each)

# Replace single-space cells with empty strings. The rows from re.findall
# are tuples and cannot be assigned into, so new lists are built.
result = [
    ['' if value == " " else value for value in row]
    for row in result
]

# The `with` block guarantees the CSV file is closed even if a write fails.
csv_path = DstDir + '\\' + name + '的历年英语等级考试.csv'
with open(csv_path, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(result)

# Remove the captcha image that was saved during login, if it is still there.
if os.path.exists(DstDir + "\\CheckCode.jpg"):
    os.remove(DstDir + "\\CheckCode.jpg")

# Wait for Enter so the final message stays visible.
input("爬虫完成,结果存在" + csv_path + "文件下")
|