1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53
|
"""Request the gsxt index page three times, deriving cookies from served JavaScript.

The first two responses contain JavaScript instead of the real page.  Each
script is trimmed/patched as a string and evaluated with ``execjs`` to obtain
a cookie value, which is then sent back in the ``Cookie`` header of the next
request.  Everything runs at module level, so the intermediate values
(``rep``, ``rep_2``, ``rep_3``, ``cookie_t`` ...) stay available as globals.
"""
import re

import execjs
import pytorchOcr  # not used in the code visible here; kept in case other code relies on it
import requests

url = 'http://bj.gsxt.gov.cn/index.html'

# Seconds to wait for each HTTP request; without a timeout a stalled
# connection would block the script forever.
REQUEST_TIMEOUT = 10

headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Host': 'bj.gsxt.gov.cn',
    'Pragma': 'no-cache',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36',
}

# NOTE(review): this session is created but never used -- every request below
# goes through requests.get() with a hand-built Cookie header.  Kept so the
# module-level name ``s`` still exists.
s = requests.session()

# --- Request 1: the response body is a small script that sets a cookie. ------
rep = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
print('第一次请求', rep.status_code)
html = rep.text
# Drop the redirect tail and the "<script>document." prefix so that what is
# left can be evaluated as plain JavaScript (presumably a bare ``cookie=...``
# assignment -- depends on the page layout, confirm against a live response).
h = html.replace(';location.href=location.pathname+location.search</script>', '').replace('<script>document.', '')
print(h)
hhh = execjs.compile(h)
cookie_t = hhh.eval('cookie')
# Raw Set-Cookie header of the first response, forwarded verbatim (attributes
# included) alongside the computed cookie.
cookie_s = rep.headers['Set-Cookie']
headers['Cookie'] = cookie_t + ';' + cookie_s

# --- Request 2: the response carries a second script. ------------------------
rep_2 = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
print('第二次请求', rep_2.status_code, 'cookies 为', cookie_t )
html_2 = re.findall('<script>(.*?)</script>', rep_2.text, re.S)
if not html_2:
    # Fail with a readable message instead of a bare IndexError on html_2[0].
    raise RuntimeError('second response contains no <script> block to evaluate')
he = html_2[0]

# Minimal browser stubs (window / navigator) prepended to the script before it
# is evaluated outside a browser.
jj = """ window = {};var navigator = new Object();navigator.userAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36";window.navigator = navigator; """

# Keep the script up to (and including) the first ')]);' that is followed by 'if'.
he_1 = he[:he.find(')]);if')]+')]);'
# Everything before the last 'var' statement is kept unchanged ...
yuesi = he_1[:he_1.rfind('var')]
# ... while the last 'var' statement is rewritten: keep it from its first '='
# onwards and turn every '=' in that tail into 'cookies = ', so the value is
# assigned to a ``cookies`` variable instead of the original name.
cookies = he_1[he_1.rfind('var'):]
cookies = cookies[cookies.find('='):]
cookies = cookies.replace('=', 'cookies = ')
# Reassemble: stubs + untouched head + rewritten assignment + an early
# 'return cookies[0]}' + the remainder of the script from ';go(' onwards.
cookies = jj+yuesi+cookies+'return cookies[0]}'+he[he.find(';go('):]

hhdddd = execjs.compile(cookies)
cookies_t_2 = '__jsl_clearance='+hhdddd.eval('cookies')[0]+';'
# cookies_t_2 already ends with ';', so no extra separator is inserted here
# (the previous code produced ';;', i.e. an empty cookie pair in the header).
headers['Cookie'] = cookies_t_2 + cookie_s

# --- Request 3: sent with the __jsl_clearance cookie. ------------------------
rep_3 = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
print('第三次请求', rep_3.status_code, 'cookies 为', cookies_t_2)
|