# Harvest e-mail addresses in three passes:
#   1. collect the distinct permalink URLs found on the already-parsed tags,
#   2. download each permalink page and pull the affiliation text out of it,
#   3. extract the unique e-mail addresses from that text.
# `tags`, `lstA`, `headers`, `requests`, `BeautifulSoup` and `re` are expected
# to be defined/imported before this point (not visible in this section).

# Pass 1: remember each permalink once, in first-seen order.
for tag in tags:
    permalink = tag.get("data-permalink-url")
    if permalink and permalink not in lstA:
        lstA.append(permalink)

# Pass 2: fetch every permalink page and keep the second child node of each
# <li> element that carries a "data-affiliation-id" attribute.
lstB = []
for link in lstA:
    try:
        # Without a timeout a single unresponsive server blocks the run forever.
        resp = requests.get(link, headers=headers, timeout=30)
    except requests.RequestException as exc:
        # One unreachable page should not discard everything gathered so far.
        print("skipping", link, "-", exc)
        continue
    resp.encoding = "utf-8"
    soup = BeautifulSoup(resp.text, "html.parser")
    for item in soup("li"):
        if not item.get("data-affiliation-id"):
            continue
        children = item.contents
        # Guard against <li> elements with fewer than two children and against
        # a nested element in that slot: only text nodes (str subclasses) can
        # be searched with a regular expression below.
        if len(children) > 1 and isinstance(children[1], str):
            lstB.append(children[1])

# Pass 3: pull e-mail-looking tokens out of the collected text.
# Raw string so that \S and \. reach the regex engine unchanged instead of
# being treated as (invalid) string escape sequences.
_email_re = re.compile(r"\S+@\S+\.[a-zA-Z0-9]+")
maillist = []
_seen_emails = set()  # O(1) duplicate check; maillist keeps first-seen order
for text in lstB:
    # findall() returns an empty list when the text holds no "@", so no
    # separate pre-check is needed.
    for email in _email_re.findall(text):
        if email not in _seen_emails:
            _seen_emails.add(email)
            maillist.append(email)

print(maillist)
print(len(maillist),"email address were retrived, done")
import sqlite3
# Persist the harvested addresses (`maillist`, built earlier in the script)
# into the EmailList table of the local SQLite file 'test.sqlite'.
conn = sqlite3.connect('test.sqlite')
cur = conn.cursor()

# The table is rebuilt from scratch on every run, so rows written by a
# previous run (including their `sent` values) are discarded.
cur.execute('DROP TABLE IF EXISTS EmailList')
cur.execute(
    " CREATE TABLE EmailList ( id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE,"
    " email TEXT UNIQUE, firstname TEXT, lastname TEXT, organization TEXT,"
    " sent INTEGER )"
)

# Insert every address in one batch. The UNIQUE constraint on `email` combined
# with INSERT OR IGNORE silently drops duplicates; the remaining columns are
# left NULL here.
cur.executemany(
    '''INSERT OR IGNORE INTO EmailList (email) VALUES ( ? )''',
    ((email,) for email in maillist),
)
# A single commit makes the whole batch durable at once.
conn.commit()
# NOTE(review): the connection is intentionally left open because code after
# this section may still use `conn` / `cur` - confirm and close it there.