|||
import re
import urllib
import urllib.request

from bs4 import BeautifulSoup
# Fetch one PLOS ONE article page, reduce it to its visible text and append
# that text to "ouput.txt" in the current working directory.
url = "http://journals.plos.org/plosone/article?id=info%3Adoi/10.1371/journal.pone.0162069"

# urllib.urlopen() exists only on Python 2; urllib.request.urlopen() is the
# Python 3 equivalent. The with-block closes the connection after reading.
with urllib.request.urlopen(url) as response:
    page = response.read()

soup = BeautifulSoup(page, "lxml")

# Remove <script> and <style> elements so their contents do not leak into
# the extracted text.
for script in soup(["script", "style"]):
    script.extract()

text = soup.get_text()

# Strip leading and trailing whitespace from every line.
lines = (line.strip() for line in text.splitlines())
# Break multi-headlines into one line each by splitting on a double space.
# NOTE(review): this copy of the script split on a single space, which puts
# every word on its own line; the separator looks like it was collapsed when
# the script was pasted -- confirm the double space is what is wanted.
chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
# Drop blank chunks and join with a real newline (the separator had
# degraded to the plain letter "n").
text = "\n".join(chunk for chunk in chunks if chunk)

# Append the text to the output file, creating the file if it is missing.
# The file is opened with an explicit UTF-8 encoding so the str can be
# written directly; writing text.encode("utf-8") to a text-mode file raises
# TypeError on Python 3.
with open("ouput.txt", "a+", encoding="utf-8") as output:
    output.write(text)
Archiver|手机版|科学网 ( 京ICP备07017567号-12 )
GMT+8, 2024-11-23 11:59
Powered by ScienceNet.cn
Copyright © 2007- 中国科学报社