# From p. 222 of the "Python + JSON" book.
# Good patterns here: separating download_to_file(url, file) into its own
# function, and saving the binary payload with `with open(..., 'wb')`.
import requests, os, time, json
import urllib.parse
from bs4 import BeautifulSoup
# --- Configuration -------------------------------------------------------
# Gallery page to scrape (the "master" listing on uta.pw/shodou).
shodou_url = 'http://uta.pw/shodou/index.php?master'
# Downloaded images are stored next to this script, in ./images.
save_dir = os.path.join(os.path.dirname(__file__), 'images')
# NOTE(review): the JSON log path is relative to the current working
# directory, unlike save_dir which is script-relative — confirm intended.
logfile = 'image.json'
# Download every image on the gallery page and record metadata as JSON.
def download_shodou(target_url):
    """Scrape *target_url*, download each image into ``save_dir``,
    and write a JSON log of {title, url, file} records to ``logfile``.

    Returns None. Prints an error and returns early if the expected
    '#contents_body > div' container is not found in the page.
    """
    # makedirs(..., exist_ok=True) is race-free and does not fail when the
    # directory already exists (os.mkdir would raise FileExistsError).
    os.makedirs(save_dir, exist_ok=True)
    # timeout keeps a dead server from hanging the script forever.
    html = requests.get(target_url, timeout=30).text
    time.sleep(1)  # be polite to the server between requests
    soup = BeautifulSoup(html, 'html.parser')
    a_div = soup.select('#contents_body > div')
    if not a_div:  # idiomatic emptiness test
        print('[エラー] 要素の取得に失敗')
        return
    images = []
    for img in a_div[0].find_all('img'):
        src = img.attrs['src']
        alt = img.attrs['alt']
        # Resolve relative src against the page URL.
        a_url = urllib.parse.urljoin(target_url, src)
        # Flatten path separators so the whole src becomes one filename.
        fname = os.path.join(save_dir, src.replace('/', '_'))
        download_to_file(a_url, fname)
        images.append({'title': alt, 'url': a_url, 'file': fname})
    # Dump the metadata log; indent + ensure_ascii=False keep the
    # Japanese titles human-readable in the file.
    with open(logfile, 'w', encoding='utf-8') as fp:
        json.dump(images, fp, indent=4, ensure_ascii=False)
# Fetch one URL and save the raw bytes — one `with open` per image.
def download_to_file(url, file):
    """Download *url* and write the raw response body to *file* (binary mode)."""
    print('download:', url)
    # Renamed from `bin`, which shadowed the builtin of the same name.
    # timeout prevents an unresponsive host from blocking indefinitely.
    data = requests.get(url, timeout=30).content
    time.sleep(1)  # rate-limit successive downloads
    with open(file, 'wb') as fp:
        fp.write(data)
# Script entry point: scrape the master gallery page and download its images.
if __name__ == '__main__':
    download_shodou(shodou_url)