Downloading Multiple Files in Python
Requests #
The most widely used HTTP library in Python is probably requests. To download a file with it, I would send a get request and write the response content to a file.
r = requests.get(url)
with open(file_name, "wb") as f:
    f.write(r.content)
But when downloading multiple files this way:
- For every file, a new session and a new connection are created.
- The downloads run synchronously, one after another, which makes the whole download take longer.
Requests using Session() #
As described in requests’ Advanced Usage, Session() provides connection pooling and reuse, which reduces the overhead of establishing a new connection for each request.
s = requests.Session()
r = s.get(url)
with open("out/" + file_name, "wb") as f:
    f.write(r.content)
Asyncio + Aiohttp #
Still, the speed difference is not that notable, so asynchronous I/O comes into play.
async with aiohttp.ClientSession() as session:
    await asyncio.gather(*[download_file_async(session, url) for url in urls])

# In download_file_async()
async with session.get(url) as r:
    content = await r.read()
with open("out/" + file_name, "wb") as f:
    f.write(content)
This reduces the whole download time significantly.
Note
You should use await in content = await r.read(); otherwise r.read() only gives back a coroutine object rather than the response bytes, and writing it to the file breaks the download.
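As a minimal sketch of the pitfall inside download_file_async() (the duplicate assignment is only for illustration):

async with session.get(url) as r:
    content = r.read()        # missing await: content is just a coroutine object, not bytes
    content = await r.read()  # with await: content holds the downloaded body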
Benchmark #
So I tested with a simple script that downloads multiple images from the Rick and Morty API.
|  | requests | requests with Session | asyncio + aiohttp |
| --- | --- | --- | --- |
| Avg. time (secs) | 7~8 | 2~4 | 0.5 |
Results varied from run to run, but the asynchronous version clearly did a better job.
Script #
http https://rickandmortyapi.com/api/character | jq '.results[].image' > images.list
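Note that jq prints each image URL with surrounding double quotes (its -r flag would print raw strings instead), which is why the scripts below strip the quotes. The scripts also assume an out/ directory already exists for the downloaded files.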
requests, requests with Session() #
import requests
import time


# Plain requests: a new session and connection for every download
def download_file(url):
    file_name = url.split("/")[-1]
    print(file_name)
    r = requests.get(url)
    with open("out/" + file_name, "wb") as f:
        f.write(r.content)


# Shared Session: connections are pooled and reused
def session_download_file(s, url):
    file_name = url.split("/")[-1]
    print(file_name)
    r = s.get(url)
    with open("out/" + file_name, "wb") as f:
        f.write(r.content)


with open("images.list", "r") as f:
    urls = f.readlines()

start = time.time()

# Plain requests version (uncomment to compare):
# for url in urls:
#     download_file(url.replace("\"", "").replace("\n", ""))

# Session version:
s = requests.Session()
for url in urls:
    session_download_file(s, url.replace("\"", "").replace("\n", ""))

end = time.time()
print(end - start)
Asyncio + Aiohttp #
import asyncio
import aiohttp
import time


# Download one file through the shared aiohttp session
async def download_file_async(session, url):
    file_name = url.split("/")[-1]
    print(file_name)
    async with session.get(url) as r:
        content = await r.read()
    with open("out/" + file_name, "wb") as f:
        f.write(content)


async def main():
    with open("images.list", "r") as f:
        urls = f.readlines()

    start = time.time()

    # Run all downloads concurrently on a single ClientSession
    async with aiohttp.ClientSession() as session:
        await asyncio.gather(*[download_file_async(session, url.replace("\"", "").replace("\n", "")) for url in urls])

    end = time.time()
    print(end - start)


if __name__ == "__main__":
    asyncio.run(main())
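If the URL list grows large, firing every download at once may be too aggressive. Here is a minimal sketch of capping concurrency with asyncio.Semaphore, reusing download_file_async() from above; the limit of 10 is an arbitrary assumption and was not part of the benchmark:

# Hypothetical variant: limit how many downloads run at the same time
async def limited_download(sem, session, url):
    async with sem:  # wait for a free slot before starting this download
        await download_file_async(session, url)

# In main(), create the semaphore and wrap each task:
#     sem = asyncio.Semaphore(10)
#     await asyncio.gather(*[limited_download(sem, session, url) for url in urls])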