# |||
import re
import urllib
from bs4 import BeautifulSoup
# Article whose text is scanned for acronym-like tokens.
url = "http://journals.plos.org/plosone/article?id=info%3Adoi/10.1371/journal.pone.0162069"

# Matches a whitespace-delimited token that contains either two adjacent
# capital letters or two capitals separated by one word character (e.g.
# "DNA", "mRNA", "PLoS"), allowing up to two extra characters on each side
# for surrounding punctuation such as "(PCR),".
# The inner groups are non-capturing so that findall() returns the whole
# token instead of a tuple of group contents.
# You can change the regex if it doesn't work properly.
pattern = re.compile(
    r"(?<=\s).{0,2}\w*(?:[A-Z]{2}|[A-Z]\w[A-Z])\w*.{0,2}(?=\s)"
)


def fetch_page_text(page_url):
    """Download *page_url* and return its visible text.

    The page is parsed with BeautifulSoup (lxml parser); ``<script>`` and
    ``<style>`` elements are removed before the text is extracted.
    """
    from contextlib import closing
    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib import urlopen  # Python 2

    # closing() guarantees the connection is released on both Python 2
    # (where the response is not a context manager) and Python 3.
    with closing(urlopen(page_url)) as response:
        page = response.read()

    soup = BeautifulSoup(page, "lxml")
    # Kill all script and style elements so their source is not scanned.
    for element in soup(["script", "style"]):
        element.extract()
    return soup.get_text()


def find_acronyms(text):
    """Return the unique acronym-like tokens found in *text*.

    Tokens are located with the module-level ``pattern``. Duplicates are
    removed and the result is sorted so the output is deterministic.
    An empty or acronym-free *text* yields an empty list.
    """
    return sorted(set(pattern.findall(text)))


def main():
    """Fetch the article, extract acronym-like tokens and print them."""
    text = fetch_page_text(url)
    result_list2 = find_acronyms(text)
    # TODO: results are not ideal yet; revisit once more familiar with bs4.
    print(result_list2)


if __name__ == "__main__":
    main()
# Archiver | Mobile version | ScienceNet.cn (Beijing ICP License No. 07017567-12)
# GMT+8, 2024-11-23 11:15
# Powered by ScienceNet.cn
# Copyright (c) 2007- China Science Daily