How to speed up Python data parsing?
My task is to parse a site structured as a taxonomy and save the results to CSV, about 24,000 links in total. So far I have put 800 links into a file, read it line by line, and collected data from each page; among the collected fields is a link from which I fetch the text I need with a second request. I split the work across processes (threads, really), but I'm not very good at this. Please look at the code: how can I speed it up? Maybe at the point where I pick up the links and follow them it would be better to do this through a file? Thanks.
import csv
import requests
from urllib.request import urlopen
from bs4 import BeautifulSoup
from multiprocessing.dummy import Pool  # this is a thread-based Pool
from multiprocessing import cpu_count
def crawlToCSV(URLrecord):
    # NOTE: this relies on the globals `soap` and `v` set in the main block;
    # the URLrecord argument is not actually used here.
    teams = []
    level_1 = 'tc'
    level_3 = soap.find('title').text
    for i in soap.find_all('tr', {'ng-show': 'pChecked || pChecked == null'}):
        t = i.find('div', {'class': 'entry-title'})
        iso = t.parent.get_text().strip()
        n = i.find('div', {'class': 'entry-summary'}).text
        iso_standart = f'{iso}{n}'
        stage = i.find('td', {'data-title': 'Stage'}).text
        data = {'level_1': level_1, 'level_3': level_3, 'stage': stage, 'iso_standart': iso_standart}
        teams.append(data)
    for item in soap.find_all('tr', {'ng-show': 'uChecked || uChecked == null'}):
        t1 = item.find('div', {'class': 'entry-title'})
        iso1 = t1.parent.get_text().strip()
        n1 = item.find('div', {'class': 'entry-summary'}).text
        iso_standart1 = f'{iso1}{n1}\n'
        stage1 = item.find('td', {'data-title': 'Stage'}).text
        data1 = {'stage1': stage1, 'iso_standart1': iso_standart1}
        teams.append(data1)
    for j, item in enumerate(teams):
        item.update({'abstract': v})  # `v` is the abstract text fetched in the main block
    return teams
def save(teams, path):
    with open(path, 'w') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(('level_1', 'level_3', 'stage', 'iso_standart'))
        for team in teams:
            writer.writerow((team['level_1'], team['level_3'], team['stage'], team['iso_standart']))
if __name__ == "__main__":
    fileName = "t24.txt"
    path = "result.csv"  # output CSV path (assumed; not defined in the snippet above)
    pool = Pool(cpu_count() * 2)  # creates a Pool with cpu_count * 2 threads
    with open(fileName) as f:
        for URLrecord in f.readlines():
            if len(URLrecord.strip()):
                q = urlopen(URLrecord)  # note: URLrecord still contains the trailing newline
                # f was already consumed by readlines(), so this map receives nothing
                results = pool.map(crawlToCSV, f)
                soap = BeautifulSoup(q, "lxml")
                for i in soap.find_all('tr', {'ng-show': 'pChecked || pChecked == null'}):
                    a1 = i.find('a').attrs['href']
                    if not a1.startswith('https'):
                        mm = f'https://www.iso.org{a1}'
                        # print(mm)
                        q2 = requests.get(mm)
                        result1 = q2.content
                        bs = BeautifulSoup(result1, 'html.parser')
                        abstract = bs.find('div', attrs={'itemprop': 'description'})
                        v = abstract.text if abstract else ''
                teams = crawlToCSV(URLrecord)
                save(teams, path)
                for team in teams:
                    print(team)
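Roughly, I imagine the restructuring would look something like this (a rough sketch, not tested): each thread fetches and parses one URL end to end, the pool maps over the list of links from t24.txt, and the CSV is written once at the end. The result.csv name, the thread count of 16, and the shared requests.Session are my own placeholders.

import csv
import requests
from bs4 import BeautifulSoup
from multiprocessing.dummy import Pool  # thread-based pool

session = requests.Session()  # reuse one connection pool across all requests

def parse_url(url):
    """Fetch one taxonomy page and return its rows as a list of dicts."""
    url = url.strip()
    if not url:
        return []
    soup = BeautifulSoup(session.get(url).content, 'lxml')
    rows = []
    for tr in soup.find_all('tr', {'ng-show': 'pChecked || pChecked == null'}):
        title = tr.find('div', {'class': 'entry-title'})
        summary = tr.find('div', {'class': 'entry-summary'})
        stage = tr.find('td', {'data-title': 'Stage'})
        # the second request for the per-row abstract could also be done here,
        # inside the worker, so it runs in parallel as well
        rows.append({
            'level_1': 'tc',
            'level_3': soup.find('title').text,
            'stage': stage.text if stage else '',
            'iso_standart': (title.parent.get_text().strip() if title else '')
                            + (summary.text if summary else ''),
        })
    return rows

if __name__ == '__main__':
    with open('t24.txt') as f:
        urls = [line.strip() for line in f if line.strip()]
    with Pool(16) as pool:                    # 16 threads; tune as needed
        all_rows = pool.map(parse_url, urls)  # one list of rows per URL
    with open('result.csv', 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=['level_1', 'level_3', 'stage', 'iso_standart'])
        writer.writeheader()
        for rows in all_rows:
            writer.writerows(rows)

Is something along these lines the right direction, or is there a better way to parallelize the fetching?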
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow