BBSダウンローダー

P250
urllib.parse.urljoin(A, B) で、基準URL A に相対URL B を結合した絶対URLを作成できる
import requests, os, time, json
import urllib.parse
from bs4 import BeautifulSoup

# Initial settings
BBS_URL = 'https://nadesi.com/cgi/bug/index.php'  # first index page to fetch
MAX_PAGES = 5  # page limit that get_logs checks before each download
LOGFILE = 'bbs_logs.json'  # path that save_logs writes the JSON output to
logs = []  # entries collected by get_logs (one dict per BBS row)
pages = []  # URLs that get_logs has already visited

# Access the BBS and extract its log data
def get_logs(target_url):
    """Crawl one BBS index page and follow its "next" link recursively.

    Every table row whose first cell contains a link is turned into a dict
    (id, title, date, priority, status, link) and appended to the
    module-level ``logs`` list. Visited URLs are recorded in the module-level
    ``pages`` list so that a page is never fetched twice and the crawl stops
    once ``MAX_PAGES`` pages have been downloaded.

    Args:
        target_url: Absolute URL of the index page to download.

    Returns:
        None. Results are accumulated in ``logs``.

    Raises:
        requests.exceptions.RequestException: If the download fails or
            exceeds the timeout.
    """
    # Stop once MAX_PAGES pages have been fetched. `>=` (not `>`) so that
    # exactly MAX_PAGES pages are downloaded, not MAX_PAGES + 1.
    if len(pages) >= MAX_PAGES:
        return
    # The same "next" link can appear more than once on a page; skip URLs
    # that were already downloaded.
    if target_url in pages:
        return
    pages.append(target_url)

    # Download the HTML. The timeout keeps a stalled server from hanging
    # the crawl forever.
    html = requests.get(target_url, timeout=30).text
    time.sleep(1)  # be polite: wait between requests

    # Parse the HTML
    soup = BeautifulSoup(html, 'html.parser')

    # Extract the log rows of the BBS
    for row in soup.select('#body div.thread > table tr'):
        # Take only the direct cell elements of the row. Iterating
        # row.children would also yield whitespace text nodes, which shift
        # the column indexes and have no usable .find('a').
        td_list = row.find_all(['td', 'th'], recursive=False)
        if len(td_list) < 6:
            continue  # not a full log row (columns 0..5 are read below)
        # The first cell links to the log page; rows without a link
        # (e.g. the header row) are skipped.
        a = td_list[0].find('a')
        if a is None:
            continue
        href = a.get('href')
        if href is None:
            continue  # anchor without a target
        # Convert the log URL into an absolute URL
        print(f'{href=}')
        href = urllib.parse.urljoin(target_url, href)
        print(f'{href=}')

        # Collect the fields of this log entry in a dict
        info = {
            'id': td_list[0].text,
            'title': td_list[1].text,
            'date': td_list[3].text,
            'priority': td_list[4].text,
            'status': td_list[5].text,
            'link': href,
        }
        print(info['id'], info['title'], info['link'])
        # Append to the collected logs
        logs.append(info)

    # Look for the "next" button in the pager
    for e in soup.select('.pager > a'):
        if e.text != '次へ→':
            continue
        link = e.get('href')
        if link is None:
            continue
        # Convert the link into an absolute URL and download that page
        # recursively; the checks at the top of the function bound the depth.
        get_logs(urllib.parse.urljoin(target_url, link))

def save_logs():
    """Write the collected ``logs`` to ``LOGFILE`` as JSON and print their count.

    The JSON is indented by four spaces and keeps non-ASCII characters
    unescaped, so the file is written as UTF-8 text.
    """
    with open(LOGFILE, mode='w', encoding='utf-8') as out:
        out.write(json.dumps(logs, indent=4, ensure_ascii=False))
    print('ログの数: ', len(logs))

# Run only when executed as a script: crawl starting from BBS_URL, then
# write the collected entries to disk.
if __name__ == '__main__':
    get_logs(BBS_URL)
    save_logs()
#
月	火	水	木	金	土	日
						1
2	3	4	5	6	7	8
9	10	11	12	13	14	15
16	17	18	19	20	21	22
23	24	25	26	27	28	29
30	31