⑴ How do I scrape an HTTPS login page with Python and make the login succeed? Thanks

My earlier attempts kept failing because I was not using the HTTPS-related functions. After looking into it more carefully, there are a few points to watch out for: first, when simulating the login with POST, the Cookie value in the request header will differ from site to site; second, when you GET a page afterwards, you need to attach the Set-Cookie value returned in the POST response. Only then does the successful login actually carry over.

After writing the POST and GET parts, I also added a simple command-line wrapper.

import httplib, urllib
import urllib2
import cookielib
import sys

file_text = "build_change.txt"
resultTable = dict()
host = 'buuuuuuu.knight.com'

def Login(username, password, csrf=''):
    url = '/login/'
    values = {
        'username': username,
        'password': password,
        'next': '',
        'csrfmiddlewaretoken': csrf,
    }

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.114 Safari/537.36',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Connection': 'keep-alive',
        'Cookie': 'csrftoken=%s' % csrf,
        'Referer': 'https://buuuuuuu.knight.com/login/',
        'Origin': 'https://buuuuuuu.knight.com',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    }
    values = urllib.urlencode(values)
    conn = httplib.HTTPSConnection(host, 443)
    conn.request("POST", url, values, headers)
    response = conn.getresponse()
    print 'Login:', response.status, response.reason
    '''
    hdata = response.getheaders()
    for i in xrange(len(hdata)):
        for j in xrange(len(hdata[i])):
            print hdata[i][j],
        print
    '''
    return response.getheader("set-cookie")


def GetHtml(_url, cookie):
    get_headers = {
        'Host': 'xxxxx.knight.com',
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'Cookie': cookie,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.114 Safari/537.36',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
    }
    conn = httplib.HTTPSConnection(host)
    conn.request("GET", _url, None, get_headers)
    res2 = conn.getresponse()
    print "Get %s:" % _url, res2.status, res2.reason
    '''
    hdata1 = res2.getheaders()
    for i in xrange(len(hdata1)):
        for j in xrange(len(hdata1[i])):
            print hdata1[i][j],
        print
    '''
    data = res2.read()
    fp = open("build_change.txt", "w")
    fp.write(data)
    fp.close()


def ParseHtml():
    fp = open(file_text, "r")
    content = fp.readline()
    _pos = 0
    while content:
        if content.find('class="change-body"') >= 0:
            topic = content.split(">")
            resultTable[_pos] = topic[1]
            while content:
                content = fp.readline()
                resultTable[_pos] = resultTable[_pos] + content
                if content.find("</div>") >= 0:
                    _pos = _pos + 1
                    break
        content = fp.readline()
    fp.close()
    print "Parse html success."


def GenerateResultTxt():
    f = open("build_change_result.txt", "w")
    for m in resultTable.keys():
        f.write("-------------------------------------------------------------------------------------------\n")
        f.write(resultTable[m])
    f.close()
    print "Generate result success: build_change_result.txt."


def Help():
    print '-h: help'
    print '-u: username (must)'
    print '-p: password (must)'
    print '-c: csrftoken (optional)'
    print '-s: sandbox build id (must)'
    print 'For example:'
    print '[1] python BuildChange.py -h'
    print '[2] python BuildChange.py -u u -p p -s s1 s2'
    print '[3] python BuildChange.py -u u -p p -c c -s s1 s2'


def ParseParam(com):
    length = len(com)
    username = ""
    password = ""
    csrf = ""
    sid1 = ""
    sid2 = ""
    if length == 2 or length == 8 or length == 10:
        if com[1] == '-h':
            Help()
        for i in range(1, length):
            if com[i] == '-u' and i < (length - 1):
                username = com[i + 1]
                i += 1
            elif com[i] == '-p' and i < (length - 1):
                password = com[i + 1]
                i += 1
            elif com[i] == '-c' and i < (length - 1):
                csrf = com[i + 1]
                i += 1
            elif com[i] == '-s' and i < (length - 2):
                sid1 = com[i + 1]
                sid2 = com[i + 2]
                i += 2
    if username == "" or password == "" or sid1 == "" or sid2 == "":
        print '[Error] Parameter error!'
        print '[Error] You can use "python BuildChange.py -h" to see how to use this script.'
    else:
        if csrf == "":
            cookie = Login(username, password)
        else:
            cookie = Login(username, password, csrf)
        _url = "//changelog//between//%s//and//%s/" % (sid1, sid2)
        GetHtml(_url, cookie)
        ParseHtml()
        GenerateResultTxt()

# C:\Python27\python.exe C:\Users\knight\Desktop\build\BuildChange.py -u xux -p KKKKKKKK -s 1859409 1858525

if __name__ == "__main__":
    ParseParam(sys.argv)
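
For comparison, the same login-then-fetch flow described above is much shorter with requests, since a Session object carries the Set-Cookie values from the login response automatically. This is only a sketch: the host and paths mirror the script above, and the credentials and sandbox ids (s1, s2) are placeholders.

import requests

session = requests.Session()

# POST the login form; the session stores any Set-Cookie it receives
login_resp = session.post(
    'https://buuuuuuu.knight.com/login/',
    data={'username': 'user', 'password': 'pass', 'next': '', 'csrfmiddlewaretoken': ''},
    headers={'Referer': 'https://buuuuuuu.knight.com/login/'},
)
print(login_resp.status_code)

# the stored cookies are sent automatically on the next request
page = session.get('https://buuuuuuu.knight.com//changelog//between//s1//and//s2/')
open('build_change.txt', 'w').write(page.content)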

⑵ How do I tell in Python whether the current page is HTTP or HTTPS

The cookies object is basically a dict: convert it to a list of tuples and join them with a suitable separator.

import requests
resp = requests.get(url)
cookies = resp.cookies
print(';'.join(['='.join(item) for item in cookies.items()]))
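
As for the question in the heading (whether the page was fetched over HTTP or HTTPS), the scheme can simply be read off the response URL; a minimal sketch, with an illustrative URL:

import requests
from urlparse import urlparse    # Python 2; on Python 3 use urllib.parse

resp = requests.get('http://example.com/')     # illustrative URL
print(urlparse(resp.url).scheme)               # 'http' or 'https' after any redirects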

⑶ How do I send HTTPS requests and receive data through a proxy service in Python

Are you building a web server?
It works fine when I test it; check whether a firewall or something similar is getting in the way.
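
If the question is simply about routing HTTPS requests through a proxy, requests supports this directly via the proxies argument; a minimal sketch, with an illustrative proxy address and URL:

import requests

proxies = {
    'http': 'http://127.0.0.1:8080',     # illustrative proxy address
    'https': 'http://127.0.0.1:8080',
}
resp = requests.get('https://example.com/', proxies=proxies)
print(resp.status_code)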

⑷ How do I read the cookie from an HTTPS exchange in Python

The cookie is set in the response headers; you can see it via response.headers['set-cookie'].
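
With requests the same information is available both as the raw header and as a parsed cookie jar; a small sketch with an illustrative URL:

import requests

resp = requests.get('https://example.com/login')            # illustrative URL
print(resp.headers.get('Set-Cookie'))                        # raw header, if the server set one
print(requests.utils.dict_from_cookiejar(resp.cookies))      # parsed into a dict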

⑸ How to do HTTPS in Python

requests and urllib2 report exactly the same error, which suggests they are built on the same underlying API, e.g. a TLS socket connection. At that point I started to suspect the problem was not in the Python code but in an operating-system-level setting.

⑹ How do I capture HTTPS packets with Python

HTTPS traffic is encrypted, so capturing the packets won't do you much good.

⑺ Problems when logging in to an HTTPS site with Python

Many sites guard against CSRF: when you GET a page containing a form, they put a token in a cookie or in a hidden field of the form, and then check for that token when you POST. See whether that is what is causing your problem.
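
A minimal sketch of that CSRF round-trip for a Django-style login form; the URL and field names here are assumptions for illustration only:

import re
import requests

session = requests.Session()
login_url = 'https://example.com/login/'       # illustrative URL

# GET the form first so the server sets the csrf cookie and embeds the token
page = session.get(login_url)
match = re.search(r'name="csrfmiddlewaretoken" value="([^"]+)"', page.text)
token = match.group(1) if match else ''

# POST the token back along with the credentials
resp = session.post(login_url,
                    data={'username': 'user', 'password': 'pass',
                          'csrfmiddlewaretoken': token},
                    headers={'Referer': login_url})
print(resp.status_code)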

⑻ How do I make an HTTPS request in Python

import requests

response = requests.get(httpsUrl, verify=False)

⑼ How does python requests handle HTTPS-encrypted data

Sometimes, for convenience, SSL certificate verification is turned off when sending the request by setting verify to False:

import requests
from requests import Request, Session

for i in range(0, 2):
    s = requests.Session()
    r1 = s.get('httw..com/&', verify=False)
    print r1

====

/usr/local/python-2.7/lib/python2.7/site-packages/requests/packages/urllib3/connectionpool.py:821: InsecureRequestWarning: Unverified HTTPS request is being made. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.org/en/latest/security.html
InsecureRequestWarning)
/usr/local/python-2.7/lib/python2.7/site-packages/requests/packages/urllib3/connectionpool.py:821: InsecureRequestWarning: Unverified HTTPS request is being made. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.org/en/latest/security.html
InsecureRequestWarning)
<Response [200]>
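
If you would rather keep certificate checking, verify can also point at a CA bundle file instead of being disabled, and the warning above can be silenced explicitly. A sketch, with an illustrative bundle path:

import requests

# silence the InsecureRequestWarning shown above (uses the urllib3 vendored in requests)
requests.packages.urllib3.disable_warnings()

# or better: keep verification and point it at a CA bundle
resp = requests.get('https://example.com/', verify='/path/to/ca-bundle.crt')
print(resp.status_code)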

⑽ What software do I need to install for Python to connect over HTTPS

I ran into a problem while writing code today and spent several hours googling.

First make sure openssl is installed and updated to the latest version, then check in a browser whether the site can be reached; if it can, openssl is probably not the problem. Next, try accessing the site with curl:

curl -v

This shows the SSL version in use. If it cannot connect, try forcing different SSL versions:

curl -1

curl -2

curl -3

Test the connection with each of the three commands above. In my case the third one connected normally (-1, -2 and -3 select the tlsv1, sslv2 and sslv3 SSL versions respectively), which means the server behind this HTTPS connection only supports SSLv3. Once the cause is known, it is easy to see how to change the Python code.

import ssl
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.poolmanager import PoolManager

class MyAdapter(HTTPAdapter):
    def init_poolmanager(self, connections, maxsize):
        self.poolmanager = PoolManager(num_pools=connections,
                                       maxsize=maxsize,
                                       ssl_version=ssl.PROTOCOL_SSLv3)

s = requests.Session()
s.mount('https://', MyAdapter())  # every https:// connection now uses ssl.PROTOCOL_SSLv3
s.get('')

urllib2 implementation:

# custom HTTPS opener, banner's oracle 10g server supports SSLv3 only
import httplib, ssl, urllib2, socket

class HTTPSConnectionV3(httplib.HTTPSConnection):
    def __init__(self, *args, **kwargs):
        httplib.HTTPSConnection.__init__(self, *args, **kwargs)

    def connect(self):
        sock = socket.create_connection((self.host, self.port), self.timeout)
        if self._tunnel_host:
            self.sock = sock
            self._tunnel()
        try:
            self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file,
                                        ssl_version=ssl.PROTOCOL_SSLv3)
        except ssl.SSLError, e:
            print("Trying SSLv3.")
            self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file,
                                        ssl_version=ssl.PROTOCOL_SSLv23)

class HTTPSHandlerV3(urllib2.HTTPSHandler):
    def https_open(self, req):
        return self.do_open(HTTPSConnectionV3, req)

# install opener
urllib2.install_opener(urllib2.build_opener(HTTPSHandlerV3()))

if __name__ == "__main__":
    r = urllib2.urlopen("https://ui2web1.apps.uillinois.e/BANPROD1/bwskfcls.P_GetCrse")
    print(r.read())

As you can see, both approaches work on the same principle: define a custom connection handler and change the SSL version used when the connection is made.