自动爬取SPECS化合物库中所有化合物的mol格式文件,
为下一步获取SPECS中化合物的CAS号做准备。
```python
#!python27
#coding: utf-8
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import time
import requests
# Browser setup: copy PhantomJS's default capabilities and override the
# user agent so the headless browser presents itself as desktop Firefox.
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap["phantomjs.page.settings.userAgent"] = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:25.0) "
    "Gecko/20100101 Firefox/25.0 "
)

# Launch the headless browser. The path is a raw string so the Windows
# backslashes survive; without separators the path would name a single,
# non-existent file relative to drive d:.
driver = webdriver.PhantomJS(
    executable_path=r'd:\Python27\Scripts\phantomjs.exe',
    desired_capabilities=dcap,
)
driver.maximize_window()  # use the full screen so screenshots show the whole page
driver.set_page_load_timeout(15)  # give up on page loads that exceed 15 seconds
# LOGIN
# Open the SPECS home page, fill in the login form and submit it.
# Screenshots 1.png-4.png capture the page after each step.
# NOTE(review): credentials are hard-coded below; consider loading them from
# the environment or a config file that is kept out of version control.
try:
    driver.get("http://www.specs.net")

    # Username field (third row of the login form table).
    user_elem = driver.find_element_by_xpath(
        '//form[@name="loginform"]/table/tbody/tr[3]/td[2]/input')
    driver.save_screenshot('1.png')
    user_elem.send_keys(u'zqchen')
    driver.save_screenshot('2.png')

    # Password field (fourth row).
    pwd_elem = driver.find_element_by_xpath(
        '//form[@name="loginform"]/table/tbody/tr[4]/td[2]/input')
    pwd_elem.send_keys(u"wlj45s")
    driver.save_screenshot('3.png')

    # Submit control (fifth row).
    login_elem = driver.find_element_by_xpath(
        '//form[@name="loginform"]/table/tbody/tr[5]/td[2]/input')
    login_elem.click()

    # Fixed pause to let the post-login page finish loading.
    time.sleep(3)
    print(driver.title)
    driver.save_screenshot('4.png')
    # NOTE(review): this only means no step raised; the page is not checked
    # for an actual logged-in state.
    print("login success")
except Exception as e:
    # Best effort: report the failure and let the script continue.
    print(e)
# GET MOL FILES
# Fetch one structure's mol file with requests, reusing the cookies from the
# browser session so the request is sent with the same session cookies.
download_link = "https://www.specs.net/mol.php?structureId=AA-173/40757587"
session = requests.Session()
try:
    # Copy every browser cookie (name/value only) into the requests session.
    for cookie in driver.get_cookies():
        session.cookies.set(cookie['name'], cookie['value'])
finally:
    # The browser is no longer needed; shut down the PhantomJS process so it
    # does not linger after the script exits.
    driver.quit()

# requests has no default timeout; bound the wait so a stalled server cannot
# hang the script indefinitely.
response = session.get(download_link, timeout=15)
print(response.content)
```