Beautiful

Some things never change, but some things do.

简单的python爬虫

# 简单的 Python 爬虫:需要 Python 3.6+,使用协程。

# -*- coding: utf-8 -*-
import os
import time
import asyncio
import aiohttp
from lxml import etree
import requests
import urllib.request


# noinspection PyBroadException
async def download(url, timeout_min):
    """Download *url* into ``d:/xxx/`` and print how long the transfer took.

    The local file name is the last ``/``-separated segment of the URL,
    percent-decoded and reduced to its base name.

    Failures (network errors, HTTP error statuses, timeouts, file-system
    errors) are printed to stdout instead of being raised.

    Args:
        url: Address of the file to fetch.
        timeout_min: Request timeout, in minutes.

    Returns:
        None.
    """
    # basename() drops any directory part smuggled in through an encoded
    # separator (e.g. "%2F"), so the file can only land in the target folder.
    file_name = os.path.basename(urllib.request.unquote(url.split('/')[-1]))
    target_name = os.path.join('d:/xxx/', file_name)
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(url, timeout=60*timeout_min) as r:
                # Do not store an HTTP error page under the expected name.
                r.raise_for_status()
                print(f'?[{file_name}]')
                # time.clock() was removed in Python 3.8; perf_counter() is
                # the documented replacement for measuring elapsed time.
                a_time = time.perf_counter()
                data = await r.read()
                # Create the file only once the whole body has arrived, so a
                # failed transfer does not leave an empty file behind.
                with open(target_name, 'wb') as f:
                    f.write(data)
                b_time = time.perf_counter()
                print(f'![{file_name}]:{b_time-a_time:.3f}s')
    except Exception as e:
        # Deliberate best-effort: report the failure and return normally.
        print(f'{file_name}:{repr(e)}')


def get_url_list():
    """Collect the download links listed on the index page.

    Fetches ``xxx.html`` below the base URL, selects every element whose
    ``class`` attribute equals ``download`` and resolves each of its ``href``
    values against the base URL.

    Returns:
        list[str]: The resolved URLs in document order; empty when no element
        matches.

    Raises:
        requests.RequestException: If the request fails, times out or the
            server answers with an HTTP error status.
        UnicodeDecodeError: If the page body is not valid UTF-8.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urljoin

    url_base = 'http://base_url/'
    # Without a timeout requests may wait forever on an unresponsive server.
    response = requests.get(f'{url_base}xxx.html', timeout=30)
    # Fail loudly instead of parsing an error page as if it were the index.
    response.raise_for_status()
    html = response.content.decode('utf-8')

    result = []
    for node in etree.HTML(html).xpath("//*[@class='download']"):
        # urljoin() gives the same result as plain concatenation for simple
        # relative hrefs and also handles absolute and root-relative ones.
        result.extend(urljoin(url_base, href) for href in node.xpath('@href'))
    return result


async def _download_all(url_list):
    """Run one ``download`` coroutine per URL concurrently and wait for all.

    Each download gets a timeout, in minutes, equal to the number of URLs.
    """
    # gather() is called inside a coroutine so that it always executes with a
    # running event loop; calling it without one is deprecated.
    await asyncio.gather(*(download(url, len(url_list)) for url in url_list))


if __name__ == '__main__':
    # time.clock() was removed in Python 3.8; perf_counter() replaces it.
    begin_time = time.perf_counter()
    url_list = get_url_list()
    # An explicitly created loop avoids the deprecated implicit loop creation
    # of asyncio.get_event_loop().
    loop = asyncio.new_event_loop()
    try:
        loop.run_until_complete(_download_all(url_list))
    finally:
        # Release the loop's resources even if a download step raises.
        loop.close()
    end_time = time.perf_counter()
    print(f'cost time:{end_time-begin_time}')