How to speed up Python data parsing?
My task is to parse a site structured as a taxonomy and save the results to CSV, about 24,000 links in total. So far I have put 800 links into a file, read it line by line, and collected data from each page; among the collected fields is a link from which I fetch the text I need with a second request. I split the work across processes (threads, really), but I'm not very good at this. Please look at the code: how can I speed it up? Maybe at the point where I pick up the links and follow them it would be better to do this through a file? Thanks.
import csv
import requests
from urllib.request import urlopen
from bs4 import BeautifulSoup
from multiprocessing.dummy import Pool  # this is a thread-based Pool
from multiprocessing import cpu_count
def crawlToCSV(URLrecord):
    # NOTE: this relies on the globals `soap` and `v` set in the main block;
    # the URLrecord argument is not actually used here.
    teams = []
    level_1 = 'tc'
    level_3 = soap.find('title').text
    for i in soap.find_all('tr', {'ng-show': 'pChecked || pChecked == null'}):
        t = i.find('div', {'class': 'entry-title'})
        iso = t.parent.get_text().strip()
        n = i.find('div', {'class': 'entry-summary'}).text
        iso_standart = f'{iso}{n}'
        stage = i.find('td', {'data-title': 'Stage'}).text
        data = {'level_1': level_1, 'level_3': level_3, 'stage': stage, 'iso_standart': iso_standart}
        teams.append(data)
    for item in soap.find_all('tr', {'ng-show': 'uChecked || uChecked == null'}):
        t1 = item.find('div', {'class': 'entry-title'})
        iso1 = t1.parent.get_text().strip()
        n1 = item.find('div', {'class': 'entry-summary'}).text
        iso_standart1 = f'{iso1}{n1}\n'
        stage1 = item.find('td', {'data-title': 'Stage'}).text
        data1 = {'stage1': stage1, 'iso_standart1': iso_standart1}
        teams.append(data1)
    for j, item in enumerate(teams):
        item.update({'abstract': v})  # `v` is the abstract text fetched in the main block
    return teams
def save(teams, path):
    with open(path, 'w') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(('level_1', 'level_3', 'stage', 'iso_standart'))
        for team in teams:
            writer.writerow((team['level_1'], team['level_3'], team['stage'], team['iso_standart']))
if __name__ == "__main__":
    fileName = "t24.txt"
    path = "result.csv"  # output CSV path (assumed; not defined in the snippet above)
    pool = Pool(cpu_count() * 2)  # creates a Pool with cpu_count * 2 threads
    with open(fileName) as f:
        for URLrecord in f.readlines():
            if len(URLrecord.strip()):
                q = urlopen(URLrecord)  # note: URLrecord still contains the trailing newline
                # f was already consumed by readlines(), so this map receives nothing
                results = pool.map(crawlToCSV, f)
                soap = BeautifulSoup(q, "lxml")
                for i in soap.find_all('tr', {'ng-show': 'pChecked || pChecked == null'}):
                    a1 = i.find('a').attrs['href']
                    if not a1.startswith('https'):
                        mm = f'https://www.iso.org{a1}'
                        # print(mm)
                        q2 = requests.get(mm)
                        result1 = q2.content
                        bs = BeautifulSoup(result1, 'html.parser')
                        abstract = bs.find('div', attrs={'itemprop': 'description'})
                        v = abstract.text if abstract else ''
                teams = crawlToCSV(URLrecord)
                save(teams, path)
                for team in teams:
                    print(team)
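Roughly, I imagine the restructuring would look something like this (a rough sketch, not tested): each thread fetches and parses one URL end to end, the pool maps over the list of links from t24.txt, and the CSV is written once at the end. The result.csv name, the thread count of 16, and the shared requests.Session are my own placeholders.

import csv
import requests
from bs4 import BeautifulSoup
from multiprocessing.dummy import Pool  # thread-based pool

session = requests.Session()  # reuse one connection pool across all requests

def parse_url(url):
    """Fetch one taxonomy page and return its rows as a list of dicts."""
    url = url.strip()
    if not url:
        return []
    soup = BeautifulSoup(session.get(url).content, 'lxml')
    rows = []
    for tr in soup.find_all('tr', {'ng-show': 'pChecked || pChecked == null'}):
        title = tr.find('div', {'class': 'entry-title'})
        summary = tr.find('div', {'class': 'entry-summary'})
        stage = tr.find('td', {'data-title': 'Stage'})
        # the second request for the per-row abstract could also be done here,
        # inside the worker, so it runs in parallel as well
        rows.append({
            'level_1': 'tc',
            'level_3': soup.find('title').text,
            'stage': stage.text if stage else '',
            'iso_standart': (title.parent.get_text().strip() if title else '')
                            + (summary.text if summary else ''),
        })
    return rows

if __name__ == '__main__':
    with open('t24.txt') as f:
        urls = [line.strip() for line in f if line.strip()]
    with Pool(16) as pool:                    # 16 threads; tune as needed
        all_rows = pool.map(parse_url, urls)  # one list of rows per URL
    with open('result.csv', 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=['level_1', 'level_3', 'stage', 'iso_standart'])
        writer.writeheader()
        for rows in all_rows:
            writer.writerows(rows)

Is something along these lines the right direction, or is there a better way to parallelize the fetching?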
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow