andrewji8


Website Cloning Crawler (with Python Code)

import argparse
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import zipfile
import re

USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0'
SAVE_DIR = 'sites'

def make_request(url, refer='', proxy_url=''):
    """Make HTTP request with optional refer and proxy."""
    headers = {
        'User-Agent': USER_AGENT
    }
    if refer:
        headers['Referer'] = refer
    if proxy_url:
        url = proxy_url + url
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"Error making request: {e}")
        return None

def is_file_link(url):
    """Check if URL points to a file."""
    path = urlparse(url).path
    return bool(re.search(r'\.\w+$', path))

def save_file(file_url, base_url, save_dir, refer='', proxy_url=''):
    """Download and save file locally."""
    if file_url.startswith('//'):
        full_url = urlparse(base_url).scheme + ':' + file_url
    elif file_url.startswith(('http://', 'https://')):
        full_url = file_url
    else:
        full_url = urljoin(base_url.rstrip('/') + '/', file_url.lstrip('/'))

    parsed_url = urlparse(full_url)
    path = parsed_url.path
    extension = os.path.splitext(path)[1][1:] or 'unknown'
    filename = os.path.basename(path)

    extension_dir = os.path.join(save_dir, extension)
    os.makedirs(extension_dir, exist_ok=True)
    file_path = os.path.join(extension_dir, filename)

    headers = {'User-Agent': USER_AGENT}
    if refer:
        headers['Referer'] = refer
    # Apply the proxy prefix, if any, the same way make_request() does.
    request_url = proxy_url + full_url if proxy_url else full_url

    try:
        response = requests.get(request_url, headers=headers)
        response.raise_for_status()

        with open(file_path, 'wb') as f:
            f.write(response.content)
        print(f"Saved file: {full_url}")
        return file_path
    except requests.exceptions.RequestException as e:
        print(f"Error saving file {full_url}: {e}")
        return None

def process_file_tags(soup, base_url, save_dir, refer, proxy_url):
    """Process HTML tags containing file references."""
    # (tag, attribute) pairs to scan. A list of pairs is used so the same tag
    # can be checked for more than one attribute (e.g. <img src> and the
    # lazy-loading <img data-original>).
    file_tags = [
        ('script', 'src'),
        ('link', 'href'),
        ('img', 'src'),
        ('img', 'data-original'),
        ('audio', 'src'),
        ('video', 'src'),
        ('source', 'src'),
    ]
    modified_content = str(soup)
    for tag, attribute in file_tags:
        elements = soup.find_all(tag, attrs={attribute: True})
        for element in elements:
            file_url = element[attribute]
            print(f"Detected link: {file_url}")

            if is_file_link(file_url):
                new_file_path = save_file(file_url, base_url, save_dir, refer, proxy_url)
                if new_file_path:
                    extension = os.path.splitext(new_file_path)[1][1:]
                    relative_path = f'./{extension}/{os.path.basename(new_file_path)}'
                    modified_content = modified_content.replace(file_url, relative_path)
            else:
                print(f"Skipping link: {file_url}")

    return modified_content

def zip_directory(source_dir, zip_path):
    """Create ZIP archive of downloaded files."""
    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, _, files in os.walk(source_dir):
            for file in files:
                file_path = os.path.join(root, file)
                arcname = os.path.relpath(file_path, source_dir)
                zipf.write(file_path, arcname)

def main():
    parser = argparse.ArgumentParser(description='Website Mirror Tool')
    parser.add_argument('url', help='URL to mirror')
    parser.add_argument('--refer', help='Referrer URL', default='')
    parser.add_argument('--proxy', help='Proxy URL', default='')
    args = parser.parse_args()

    parsed_url = urlparse(args.url)
    host = parsed_url.netloc.replace('.', '')
    base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"

    save_dir = os.path.join(SAVE_DIR, host)
    os.makedirs(save_dir, exist_ok=True)

    html_content = make_request(args.url, args.refer, args.proxy)
    if not html_content:
        print("Failed to retrieve webpage")
        return

    soup = BeautifulSoup(html_content, 'html.parser')
    modified_content = process_file_tags(soup, base_url, save_dir, args.refer, args.proxy)

    html_file_path = os.path.join(save_dir, f"{host}.html")
    with open(html_file_path, 'w', encoding='utf-8') as f:
        f.write(modified_content)
    print(f"Saved HTML file: {html_file_path}")

    zip_path = os.path.join(SAVE_DIR, f"{host}.zip")
    zip_directory(save_dir, zip_path)
    print(f"Created ZIP archive: {zip_path}")

if __name__ == '__main__':
    main()

Main features:
Creates a sites folder in the current directory
Creates a subfolder inside sites named after the target domain
Downloaded files are grouped into subfolders by file extension (e.g. a png folder for PNG images, a css folder for stylesheets, a js folder for scripts)
Links in the HTML are rewritten to point at the local copies
Finally, a ZIP archive containing everything that was downloaded is created
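
Assuming the script is saved as site_cloner.py (the post does not give a filename, so this name is only an example), a typical run from the command line looks like the sketch below. The --refer and --proxy flags are optional; the proxy value is simply prefixed onto the requested URL, matching make_request(), and the proxy address shown here is made up:

python site_cloner.py https://example.com
python site_cloner.py https://example.com --refer https://example.com --proxy https://your-web-proxy.example/?url=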

For example, if you download https://example.com:
The HTML file is saved as sites/examplecom/examplecom.html
Everything is compressed into sites/examplecom.zip
Images are saved under extension-named directories such as sites/examplecom/png/ or sites/examplecom/jpg/
CSS files are saved in the sites/examplecom/css/ directory
JavaScript files are saved in the sites/examplecom/js/ directory
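
Based on the directory logic in save_file() and main(), the on-disk layout for example.com would look roughly like the sketch below; the individual file names are made up for illustration, and the extension folders depend on what the page actually references:

sites/
    examplecom/
        examplecom.html
        css/
            style.css
        js/
            app.js
        png/
            logo.png
    examplecom.zip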

While running, the console shows:
The file links that were detected
The local path of each downloaded file
The locations of the generated HTML file and ZIP archive
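
All of these messages come from the print() calls in the script. With made-up file names, an illustrative run would print output along these lines:

Detected link: /static/style.css
Saved file: https://example.com/static/style.css
Detected link: #top
Skipping link: #top
Saved HTML file: sites/examplecom/examplecom.html
Created ZIP archive: sites/examplecom.zip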
