Python笔记七:通过Python爬取邮箱地址并写入SQLite数据库

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import requests
# Browser-like User-Agent header that this script passes to its requests.get() calls.
headers = {"user-agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"}
from bs4 import BeautifulSoup
import re

# Base URL of the first result page; further pages are addressed by adding a
# "page" query parameter to it.
url = "https://pubmed.ncbi.nlm.nih.gov/"

# How many result pages to request in addition to the first one.
extra_pages = 1

# "&page=N" is only valid when the URL already carries a query string;
# otherwise the first parameter has to be introduced with "?".
separator = "&" if "?" in url else "?"

# Suffixes for page 2, 3, ... (page 1 is the bare base URL).
pagelist = [
    separator + "page=" + str(page)
    for page in range(2, 2 + extra_pages)
]

# Full list of page URLs to download, first page included.
urllist = [url] + [url + eachpage for eachpage in pagelist]
print(len(urllist),"pages were retrieved, ok!")

# Download every page in urllist and collect the distinct, non-empty
# "data-permalink-url" attribute values found on its <button> elements.
# lstA keeps them in first-seen order.
lstA = list()
seen_links = set()  # constant-time duplicate check alongside the ordered list
for eachurl in urllist:
    print("Url:",eachurl)
    # Without a timeout a single stalled connection would block forever.
    resp = requests.get(eachurl, headers=headers, timeout=30)
    resp.encoding = "utf-8"
    soup = BeautifulSoup(resp.text, "html.parser")

    for tag in soup.find_all("button"):
        permalink = tag.get("data-permalink-url")
        if permalink and permalink not in seen_links:
            seen_links.add(permalink)
            lstA.append(permalink)
# Download every link collected in lstA and, for each <li> element that has a
# non-empty "data-affiliation-id" attribute, store its second child node in
# lstB (duplicates are kept).
lstB = list()
for link in lstA:
    # Without a timeout a single stalled connection would block forever.
    resp = requests.get(link, headers=headers, timeout=30)
    resp.encoding = "utf-8"
    soup = BeautifulSoup(resp.text, "html.parser")

    for tag in soup.find_all("li"):
        if not tag.get("data-affiliation-id"):
            continue
        # Index 1 is only valid when the item has at least two child nodes;
        # shorter items are skipped instead of raising IndexError.
        if len(tag.contents) > 1:
            lstB.append(tag.contents[1])
# Extract e-mail-like substrings from the texts in lstB, keeping each address
# once, in first-seen order.
# The pattern is a raw string: in a normal literal "\S" is an invalid escape
# sequence and triggers a warning on current Python versions. Compiling it
# once also keeps the work out of the loop.
email_pattern = re.compile(r"\S+@\S+\.[a-zA-Z0-9]+")

maillist = list()
seen_emails = set()  # constant-time duplicate check alongside the ordered list
for text in lstB:
    # The pattern itself requires an "@", so texts without one yield no matches.
    for email in email_pattern.findall(text):
        if email not in seen_emails:
            seen_emails.add(email)
            maillist.append(email)
print(maillist)
print(len(maillist),"email address were retrived, done")

import sqlite3

# Store the addresses from maillist in the SQLite database file 'test.sqlite'.
# The EmailList table is dropped and recreated on every run, so the file only
# ever holds the result of the latest run.
conn = sqlite3.connect('test.sqlite')
try:
    cur = conn.cursor()

    cur.execute('DROP TABLE IF EXISTS EmailList')

    # Only the email column is filled in below; the other columns stay NULL.
    cur.execute('''
    CREATE TABLE EmailList (
    id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE,
    email TEXT UNIQUE,
    firstname TEXT,
    lastname TEXT,
    organization TEXT,
    sent INTEGER
    )''')

    # INSERT OR IGNORE lets the UNIQUE constraint on email drop duplicates.
    cur.executemany('''INSERT OR IGNORE INTO EmailList (email)
    VALUES ( ? )''', ((email,) for email in maillist))

    # One commit for the whole batch.
    conn.commit()
finally:
    # Always release the database handle, even if a statement above failed.
    conn.close()
  • 本文作者:括囊无誉
  • 本文链接: python-7-get-email/
  • 版权声明: 本博客所有文章均为原创作品,转载请注明出处!
------ 本文结束 ------
坚持原创文章分享,您的支持将鼓励我继续创作!