ウェブサイトクローンクローラー（Pythonコード付き）

import argparse
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import zipfile
import re

# Browser-like User-Agent header value sent with the HTTP requests in this file.
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0'
# Root directory under which each mirrored site gets its own subfolder and ZIP.
SAVE_DIR = 'sites'

def make_request(url, refer='', proxy_url=''):
    """Fetch *url* with a GET request and return the response body as text.

    Args:
        url: Address of the page to fetch.
        refer: Optional value for the ``Referer`` header; omitted when empty.
        proxy_url: Optional prefix that is prepended verbatim to *url*
            (a URL-prefix style proxy, not a ``requests`` proxies mapping).

    Returns:
        The decoded response body, or ``None`` when the request fails, times
        out, or the server answers with an error status.
    """
    headers = {'User-Agent': USER_AGENT}
    if refer:
        headers['Referer'] = refer
    target = proxy_url + url if proxy_url else url
    try:
        # requests applies no timeout by default, so an unresponsive server
        # would otherwise block the tool forever.
        response = requests.get(target, headers=headers, timeout=30)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"リクエスト中にエラーが発生しました: {e}")
        return None
    return response.text

def is_file_link(url):
    """Tell whether the path part of *url* ends with a dot plus word characters.

    Only the path component is inspected, so query strings and fragments do
    not influence the result.
    """
    url_path = urlparse(url).path
    suffix_match = re.search(r'\.\w+$', url_path)
    return suffix_match is not None

def save_file(file_url, base_url, save_dir, refer='', proxy_url=''):
    """Download one referenced file into a per-extension folder of *save_dir*.

    Args:
        file_url: Link as found in the page: absolute (``http(s)://``),
            scheme-relative (``//host/path``) or relative.
        base_url: URL used to resolve scheme-relative and relative links.
        save_dir: Directory that receives ``<extension>/<filename>``; files
            without an extension go to ``unknown/``.
        refer: Optional ``Referer`` header value; omitted when empty.
        proxy_url: Optional prefix prepended verbatim to the resolved URL,
            the same way ``make_request`` applies it.

    Returns:
        The local path of the saved file, or ``None`` when the URL carries no
        file name, the download fails, or the file cannot be written.
    """
    if file_url.startswith('//'):
        full_url = urlparse(base_url).scheme + ':' + file_url
    elif file_url.startswith(('http://', 'https://')):
        full_url = file_url
    else:
        # Relative links are resolved with base_url treated as a directory.
        full_url = urljoin(base_url.rstrip('/') + '/', file_url.lstrip('/'))

    path = urlparse(full_url).path
    filename = os.path.basename(path)
    if not filename:
        # A path ending in "/" has no file name; the target would be the
        # extension directory itself and open() would raise.
        print(f"リンクをスキップします: {full_url}")
        return None

    extension = os.path.splitext(path)[1][1:] or 'unknown'
    extension_dir = os.path.join(save_dir, extension)
    file_path = os.path.join(extension_dir, filename)

    headers = {'User-Agent': USER_AGENT}
    if refer:
        headers['Referer'] = refer
    request_url = proxy_url + full_url if proxy_url else full_url

    try:
        # requests applies no timeout by default; bound the wait per file.
        response = requests.get(request_url, headers=headers, timeout=30)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"ファイルを保存中にエラーが発生しました {full_url}: {e}")
        return None

    try:
        # Create the folder only once there is something to store in it.
        os.makedirs(extension_dir, exist_ok=True)
        with open(file_path, 'wb') as f:
            f.write(response.content)
    except OSError as e:
        print(f"ファイルを保存中にエラーが発生しました {full_url}: {e}")
        return None

    print(f"ファイルを保存しました: {full_url}")
    return file_path

def process_file_tags(soup, base_url, save_dir, refer, proxy_url):
    """Download files referenced by the page and point the tags at the copies.

    Every ``script``/``img``/``audio``/``video``/``source`` ``src``, ``link``
    ``href`` and ``img`` ``data-original`` value that ``is_file_link`` accepts
    is passed to ``save_file``. When the download succeeds, that attribute is
    rewritten to the saved file's path relative to *save_dir*.

    Args:
        soup: Parsed document; its tag attributes are modified in place.
        base_url: Forwarded to ``save_file`` for resolving links.
        save_dir: Directory the files are stored under.
        refer: Forwarded to ``save_file`` as the ``Referer`` value.
        proxy_url: Forwarded to ``save_file``.

    Returns:
        The serialized HTML of *soup* after the attributes were rewritten.
    """
    # A list of pairs rather than a dict: ``img`` appears twice, and a dict
    # literal would silently keep only the last ``img`` entry.
    file_tags = [
        ('script', 'src'),
        ('link', 'href'),
        ('img', 'src'),
        ('audio', 'src'),
        ('video', 'src'),
        ('source', 'src'),
        ('img', 'data-original'),
    ]
    # Link text -> rewritten relative path (None when the download failed),
    # so a file referenced several times is fetched only once.
    downloaded = {}
    for tag, attribute in file_tags:
        for element in soup.find_all(tag, attrs={attribute: True}):
            file_url = element[attribute]
            print(f"検出されたリンク: {file_url}")

            if not is_file_link(file_url):
                print(f"リンクをスキップします: {file_url}")
                continue

            if file_url not in downloaded:
                relative_path = None
                new_file_path = save_file(file_url, base_url, save_dir, refer, proxy_url)
                if new_file_path:
                    # Derive the link from where the file really landed, so
                    # files stored under "unknown/" are referenced correctly.
                    local = os.path.relpath(new_file_path, save_dir)
                    relative_path = './' + local.replace(os.sep, '/')
                downloaded[file_url] = relative_path

            relative_path = downloaded[file_url]
            if relative_path:
                # Rewrite only this attribute; a global text replace would
                # also hit other URLs that merely contain the same substring.
                element[attribute] = relative_path

    return str(soup)

def zip_directory(source_dir, zip_path):
    """Pack every file below *source_dir* into a deflated ZIP at *zip_path*.

    Entries are stored under their path relative to *source_dir*.
    """
    archive = zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED)
    with archive:
        for current_dir, _subdirs, names in os.walk(source_dir):
            for name in names:
                absolute = os.path.join(current_dir, name)
                archive.write(absolute, os.path.relpath(absolute, source_dir))

def main():
    """Mirror the single page given on the command line.

    Parses ``url``, ``--refer`` and ``--proxy``, downloads the page, stores
    the files it references under ``SAVE_DIR/<host>/``, writes the rewritten
    HTML there as ``<host>.html`` and finally zips that folder to
    ``SAVE_DIR/<host>.zip``. Exits through ``parser.error`` when the URL has
    no scheme or host.
    """
    parser = argparse.ArgumentParser(description='ウェブサイトミラーリングツール')
    parser.add_argument('url', help='ミラーリングするURL')
    parser.add_argument('--refer', help='リファラーURL', default='')
    parser.add_argument('--proxy', help='プロキシURL', default='')
    args = parser.parse_args()

    parsed_url = urlparse(args.url)
    if not parsed_url.scheme or not parsed_url.netloc:
        # Input such as "example.com" parses with an empty host; stop before
        # any directory is created for it.
        parser.error('URLにはスキームとホスト名が必要です（例: https://example.com）')

    # Dots are dropped to build the folder name; the port separator ":" is
    # replaced because it is not a valid file-name character on Windows.
    host = parsed_url.netloc.replace('.', '').replace(':', '_')
    base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"

    save_dir = os.path.join(SAVE_DIR, host)
    os.makedirs(save_dir, exist_ok=True)

    html_content = make_request(args.url, args.refer, args.proxy)
    if not html_content:
        print("ウェブページの取得に失敗しました")
        return

    soup = BeautifulSoup(html_content, 'html.parser')
    modified_content = process_file_tags(soup, base_url, save_dir, args.refer, args.proxy)

    html_file_path = os.path.join(save_dir, f"{host}.html")
    with open(html_file_path, 'w', encoding='utf-8') as f:
        f.write(modified_content)
    print(f"HTMLファイルを保存しました: {html_file_path}")

    # The archive is written next to the site folder, not inside it, so it
    # never ends up containing itself.
    zip_path = os.path.join(SAVE_DIR, f"{host}.zip")
    zip_directory(save_dir, zip_path)
    print(f"ZIPアーカイブを作成しました: {zip_path}")

# Run the mirroring tool only when this file is executed as a script.
if __name__ == '__main__':
    main()

主要機能：
現在のディレクトリに sites フォルダを作成します
sites フォルダ内にドメイン名に基づくサブフォルダを作成します
ダウンロードされたファイルは拡張子別に分類されて保存されます（例：png フォルダには PNG 画像、css フォルダにはスタイルシートなど）
HTML ファイル内のリンクを自動的に修正し、ローカルファイルを指すようにします
最後に、すべてのダウンロードされたファイルを含む ZIP 圧縮ファイルを生成します

例えば、https://example.com をダウンロードする場合：
HTML ファイルは sites/examplecom/examplecom.html に保存されます
すべてのファイルは sites/examplecom.zip に圧縮されます
画像は拡張子ごとのディレクトリ（例：sites/examplecom/png/、sites/examplecom/jpg/）に保存されます
CSS ファイルは sites/examplecom/css/ ディレクトリに保存されます
JavaScript ファイルは sites/examplecom/js/ ディレクトリに保存されます

実行中はコンソールに以下の内容が表示されます：
検出されたファイルリンク
ダウンロードして保存されたファイルの URL
最終的に生成された HTML および ZIP ファイルの保存先