Python Notes 6: Harvesting Email Addresses from Multiple PubMed Search Pages with Python

This post records how to build the URLs for multiple search pages and then use a for-in loop to collect the email addresses from each article. The code is as follows:
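Before the full script, the page-URL construction can be sketched on its own: each page after the first just appends a `&page=N` query parameter to the search URL. The base URL below is a hypothetical placeholder (the script itself uses "XXX"):

```python
# Hypothetical PubMed search URL -- a stand-in, not taken from the post
base_url = "https://pubmed.ncbi.nlm.nih.gov/?term=example"
n_extra_pages = 1  # how many pages to fetch beyond the first

urllist = [base_url]  # page 1 needs no "&page=" parameter
for page in range(2, 2 + n_extra_pages):
    urllist.append(base_url + "&page=" + str(page))

print(urllist)
```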

import requests
from bs4 import BeautifulSoup
import re

headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"}

url = "XXX"  # the URL used in this post is a PubMed search page

# Build the "&page=N" suffixes for the extra search pages
page = 1
pagelist = list()
for page_id in range(1):
    page = page + 1
    pagelist.append("&page=" + str(page))
# print(pagelist)

urllist = list()
urllist.append(url)
for eachpage in pagelist:
    urllist.append(url + eachpage)
    # print(url + eachpage)
# print(urllist)
print(len(urllist), "pages were retrieved, ok!")

# Collect each article's permalink from every search page
lstA = list()
for eachurl in urllist:
    print("Url:", eachurl)
    resp = requests.get(eachurl, headers=headers)
    # print(resp.status_code)
    resp.encoding = "utf-8"
    soup = BeautifulSoup(resp.text, "html.parser")
    tags = soup("button")
    for tag in tags:
        if tag.get("data-permalink-url") and tag.get("data-permalink-url") not in lstA:
            lstA.append(tag.get("data-permalink-url"))
# print(lstA)
# print(len(lstA))

# Visit each article page and keep the affiliation entries
lstB = list()
for link in lstA:
    resp = requests.get(link, headers=headers)
    # print(resp.status_code)
    resp.encoding = "utf-8"
    # print(resp.text)
    soup = BeautifulSoup(resp.text, "html.parser")
    tags = soup("li")
    for tag in tags:
        if tag.get("data-affiliation-id"):
            lstB.append(tag.contents[1])
# print(lstB)
# print(len(lstB))

# Pull unique email addresses out of the affiliation text
maillist = list()
for text in lstB:
    if re.search("@", text):
        emails = re.findall(r"\S+@\S+\.[a-zA-Z0-9]+", text)
        for email in emails:
            if email not in maillist:
                maillist.append(email)
# print(maillist)
print(len(maillist), "email addresses were retrieved, done")

Output:

17 email addresses were retrieved, done
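The email-matching step can be exercised in isolation. The affiliation string below is a made-up example, not real scraped data; it shows why the pattern ends in `\.[a-zA-Z0-9]+` rather than just `\S+`: the trailing sentence period is not captured as part of the address.

```python
import re

# Made-up affiliation text of the kind found on PubMed article pages
text = ("Department of Biology, Example University. "
        "Electronic address: jane.doe@example.edu.")

# Non-space runs around "@", ending in a dotted alphanumeric suffix,
# so the sentence's final period is excluded from the match
emails = re.findall(r"\S+@\S+\.[a-zA-Z0-9]+", text)
print(emails)  # -> ['jane.doe@example.edu']
```

Note the pattern is deliberately loose; it can still pick up stray punctuation glued to an address, which is why the script also de-duplicates before counting.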
  • Author: 括囊无誉
  • Link: python-6-get-email/
  • Copyright: All posts on this blog are original works; please credit the source when republishing!