Python raises a multiprocessing RuntimeError

RuntimeError: 
        An attempt has been made to start a new process before the
        current process has finished its bootstrapping phase.

        This probably means that you are not using fork to start your
        child processes and you have forgotten to use the proper idiom
        in the main module:

            if __name__ == '__main__':
                freeze_support()
                ...

        The "freeze_support()" line can be omitted if the program
        is not going to be frozen to produce an executable.

The above is the full error message that Python prints.

Below is the original code that triggers the error:

import multiprocessing as mp
import time
from urllib.request import urlopen
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import re

base_url = "https://google.com/"

#crawl
def crawl(url):
    response = urlopen(url)
    time.sleep(0.1)
    return response.read().decode()

#parse
def parse(html):
    soup = BeautifulSoup(html, 'html.parser')
    urls = soup.find_all('a', {"href": re.compile('^/.+?/$')})
    title = soup.find('h1').get_text().strip()
    page_urls = {urljoin(base_url, url['href']) for url in urls}
    url = soup.find('meta', {'property': "og:url"})['content']
    return title, page_urls, url

unseen = set([base_url])
seen = set()
restricted_crawl = True

pool = mp.Pool(4)
count, t1 = 1, time.time()
while len(unseen) != 0:                 # still have URLs to visit
    if restricted_crawl and len(seen) > 20:
        break
    print('\nDistributed Crawling...')
    crawl_jobs = [pool.apply_async(crawl, args=(url,)) for url in unseen]
    htmls = [j.get() for j in crawl_jobs]      # request connection

    print('\nDistributed Parsing...')
    parse_jobs = [pool.apply_async(parse, args=(html,)) for html in htmls]
    results = [j.get() for j in parse_jobs]    # parse html

    print('\nAnalysing...')
    seen.update(unseen)         # mark the crawled URLs as seen
    unseen.clear()              # nothing unseen

    for title, page_urls, url in results:
        print(count, title, url)
        count += 1
        unseen.update(page_urls - seen)     # get new url to crawl
print('Total time: %.1f s' % (time.time()-t1))    # 16 s !!!

Here is the corrected code:

import multiprocessing as mp
import time
from urllib.request import urlopen
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import re

base_url = "https://google.com/"

#crawl
def crawl(url):
    response = urlopen(url)
    time.sleep(0.1)
    return response.read().decode()

#parse
def parse(html):
    soup = BeautifulSoup(html, 'html.parser')
    urls = soup.find_all('a', {"href": re.compile('^/.+?/$')})
    title = soup.find('h1').get_text().strip()
    page_urls = {urljoin(base_url, url['href']) for url in urls}
    url = soup.find('meta', {'property': "og:url"})['content']
    return title, page_urls, url

def main():
    unseen = set([base_url])
    seen = set()
    restricted_crawl = True

    pool = mp.Pool(4)
    count, t1 = 1, time.time()
    while len(unseen) != 0:                 # still have URLs to visit
        if restricted_crawl and len(seen) > 20:
            break
        print('\nDistributed Crawling...')
        crawl_jobs = [pool.apply_async(crawl, args=(url,)) for url in unseen]
        htmls = [j.get() for j in crawl_jobs]      # request connection

        print('\nDistributed Parsing...')
        parse_jobs = [pool.apply_async(parse, args=(html,)) for html in htmls]
        results = [j.get() for j in parse_jobs]    # parse html

        print('\nAnalysing...')
        seen.update(unseen)         # mark the crawled URLs as seen
        unseen.clear()              # nothing unseen

        for title, page_urls, url in results:
            print(count, title, url)
            count += 1
            unseen.update(page_urls - seen)     # get new url to crawl
    print('Total time: %.1f s' % (time.time()-t1))    # 16 s !!!


if __name__ == '__main__':
    main()
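
As a side note, the corrected code never shuts the pool down. Here is a minimal sketch (with a toy double function standing in for the crawler) of the same entry-point pattern using the pool as a context manager, which terminates the workers automatically when the block exits:

import multiprocessing as mp

def double(x):
    return x * 2

def main():
    # Exiting the with-block terminates the pool, so no worker
    # processes are left running after the work is done.
    with mp.Pool(4) as pool:
        print(pool.map(double, range(8)))

if __name__ == '__main__':
    main()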

In summary, the fix is to move your top-level code into a function and then add

if __name__ == '__main__':
    main()

at the end of the script. This guard solves the problem. The underlying reason: on Windows (and on macOS since Python 3.8), multiprocessing starts child processes with the "spawn" method, which imports the main module afresh in every child. Without the guard, each child re-runs the pool-creating code at import time and tries to start children of its own before it has finished bootstrapping, which Python aborts with the RuntimeError above.
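
A minimal sketch (not from the original post) that makes the spawn behavior visible: the module-level print runs once in the parent and again in every spawned child, while the guarded block runs only in the parent.

import multiprocessing as mp
import os

# Runs in the parent AND in every spawned child, because the spawn
# start method re-imports this module in each child process.
print('module imported in process', os.getpid())

def work(x):
    return x * 2

if __name__ == '__main__':
    mp.set_start_method('spawn', force=True)  # make the start method explicit
    with mp.Pool(2) as pool:
        print(pool.map(work, [1, 2, 3]))
    # Only the parent reaches this point; the children import the
    # module, skip the guarded block, and just run the worker tasks.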
