[Python crawler] Scraping data with a callback function and saving it to disk!

Remember the earlier piece of code that downloaded all of the index pages?

import requests
import re
import time
from urllib.parse import urljoin
import lxml.html
import cssselect
import csv
def download(url, user_agent='wswp', num_retries=2):
    print('downloading:', url)
    time.sleep(0.5)  # be polite: throttle requests a little
    headers = {'User-Agent': user_agent}
    html = requests.get(url, headers=headers)
    if html.status_code != requests.codes.ok:
        e = html.status_code
        print('Download error:', e)
        if num_retries > 0:
            if 500 <= e < 600:
                # retry server errors, keeping the same user agent
                return download(url, user_agent, num_retries - 1)
    return html

def link_crawler(seed_url, link_regex):  # link_regex is the pattern a link must match to be crawled
    crawl_queue = [seed_url]
    # duplicate-link check:
    seen = set(crawl_queue)
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url).text
        links = []
        links.extend(scrape_callback(url, html) or [])
        links.extend(link for link in get_links(html) if re.match(link_regex, link))
        for link in links:
            link = urljoin(seed_url, link)
            # make sure this URL has not been downloaded before
            if link not in seen:
                seen.add(link)
                crawl_queue.append(link)


def get_links(html):
    # pull out everything between href="..." or href='...'
    return re.findall(r'''(?<=href=").+?(?=")|(?<=href=').+?(?=')''', html)

def scrape_callback(url, html):
    # only country detail pages ("/view/") contain the data table
    if re.search('/view/', url):
        tree = lxml.html.fromstring(html)
        row = []
        row.append(tree.cssselect('tr#places_population__row > td.w2p_fw')[0].text_content())
        print(row)
    return [url]

link_crawler('http://example.webscraping.com', '/places/default/view/')
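To see what the cssselect call inside scrape_callback is matching, here is a minimal, self-contained sketch (not part of the original script) that runs the same selector against a hand-written HTML fragment shaped like the country pages; the fragment and the number in it are made up for illustration:

import lxml.html

# A hand-written fragment mimicking the table layout of a country page
# (assumed structure: one <tr id="places_<field>__row"> per field).
snippet = '''
<table>
  <tr id="places_population__row">
    <td class="w2p_fl">Population:</td>
    <td class="w2p_fw">29,121,286</td>
  </tr>
</table>
'''

tree = lxml.html.fromstring(snippet)
# The same selector scrape_callback uses: the value cell of the population row
cell = tree.cssselect('tr#places_population__row > td.w2p_fw')[0]
print(cell.text_content())   # -> 29,121,286

The selector reads as: the td with class w2p_fw that is a direct child of the tr whose id is places_population__row.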

Just printing the values is not much use, though. The code below redefines the callback so that it writes the scraped fields into a table instead, which is very handy!
Next comes the complete script: it uses the csv module to write the data out and produces a countries.csv file.

import requests
import re
import time
from urllib.parse import urljoin
import lxml.html
import cssselect
import csv
def download(url, user_agent='wswp', num_retries=2):
    print('downloading:', url)
    time.sleep(0.5)  # be polite: throttle requests a little
    headers = {'User-Agent': user_agent}
    html = requests.get(url, headers=headers)
    if html.status_code != requests.codes.ok:
        e = html.status_code
        print('Download error:', e)
        if num_retries > 0:
            if 500 <= e < 600:
                # retry server errors, keeping the same user agent
                return download(url, user_agent, num_retries - 1)
    return html

def link_crawler(seed_url, link_regex):  # link_regex is the pattern a link must match to be crawled
    crawl_queue = [seed_url]
    # duplicate-link check:
    seen = set(crawl_queue)
    # write the CSV header row once, before crawling starts
    with open('countries.csv', 'a', newline='') as f:
        writer = csv.writer(f)
        fields = ('area', 'population', 'iso', 'country', 'capital', 'continent', 'tld',
                  'currency_code', 'currency_name', 'phone', 'postal_code_format',
                  'postal_code_regex', 'languages', 'neighbours')
        writer.writerow(fields)
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url).text
        links = []
        links.extend(ScrapeCallback(url, html, fields) or [])
        links.extend(link for link in get_links(html) if re.match(link_regex, link))
        for link in links:
            link = urljoin(seed_url, link)
            # make sure this URL has not been downloaded before
            if link not in seen:
                seen.add(link)
                crawl_queue.append(link)

def get_links(html):
    # pull out everything between href="..." or href='...'
    return re.findall(r'''(?<=href=").+?(?=")|(?<=href=').+?(?=')''', html)

def ScrapeCallback(url, html, fields):
    # only country detail pages ("/view/") contain the data table
    if re.search('/view/', url):
        tree = lxml.html.fromstring(html)
        row = []
        with open('countries.csv', 'a+', newline='') as f:
            writer = csv.writer(f)
            for field in fields:
                # each value sits in the row with id "places_<field>__row"
                row.append(tree.cssselect('tr#places_{}__row > td.w2p_fw'.format(field))[0].text_content())
            writer.writerow(row)

    return [url]



link_crawler('http://example.webscraping.com', '/places/default/view/|/places/default/index/')

Running the script produces the generated countries.csv file.
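Once the crawl finishes, you can sanity-check the result by reading countries.csv back in with csv.DictReader; this is just a quick verification sketch (not part of the original post), and it assumes the crawler above has already been run in the same directory:

import csv

# Read the generated file back and print a few of the scraped columns
# (assumes countries.csv was produced by the crawler above).
with open('countries.csv', newline='') as f:
    for record in csv.DictReader(f):
        print(record['country'], record['population'], record['capital'])

DictReader uses the header row that link_crawler wrote, so each record comes back as a dictionary keyed by the field names.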

This code was adapted from the book 《用Python写网络爬虫》 (Web Scraping with Python), but the version printed there contains quite a few errors; I had to look things up and fix them one by one to arrive at this runnable version.
