import os
import urllib.parse
import xml.etree.ElementTree as ET

import requests
from tqdm import tqdm

# -----------------------------
# CONFIGURATION
# -----------------------------
# Path of the bucket-listing XML document that is parsed at startup.
XML_FILE = "bucket.xml"
# Prefix prepended to each object key to form its download URL.
BASE_URL = "http://download.gl-inet.com.s3.amazonaws.com/"
# Local directory under which downloaded files are stored, mirroring the key
# layout.
OUTPUT_DIR = "downloads"

# -----------------------------
# PARSE XML + HANDLE NAMESPACE
# -----------------------------
tree = ET.parse(XML_FILE)
root = tree.getroot()

# ElementTree spells a namespaced tag as "{uri}Name". Keep the "{uri}" prefix
# of the root tag (or an empty string when the document has no namespace) so
# later lookups can qualify plain tag names with it.
ns = (root.tag.partition("}")[0] + "}") if root.tag.startswith("{") else ""

def find_text(elem, tag):
    """Return the text of the direct child *tag* of *elem*, or None.

    *tag* is given without a namespace; the module-level ``ns`` prefix is
    prepended before the lookup. None is returned when no such child exists
    (and also when the child exists but carries no text).
    """
    qualified_tag = f"{ns}{tag}"
    node = elem.find(qualified_tag)
    if node is None:
        return None
    return node.text

# -----------------------------
# DOWNLOAD WITH PROGRESS BAR
# -----------------------------
def download_with_progress(url, local_path, timeout=30):
    """Stream *url* into *local_path* while showing a tqdm progress bar.

    The body is written to ``<local_path>.part`` and only renamed to
    *local_path* once the transfer has finished, so an interrupted or failed
    download never leaves a truncated file that looks complete.

    Args:
        url: Address to fetch with an HTTP GET.
        local_path: Destination file; missing parent directories are created.
        timeout: Seconds handed to ``requests.get`` as its ``timeout``
            argument, so a stalled server cannot block the run indefinitely.

    Returns:
        None. A non-200 response or a ``requests`` error is reported with
        ``print`` and leaves no file behind; other errors (for example a
        full disk) propagate after the partial file has been removed.
    """
    # os.makedirs("") raises, so only create a directory when there is one.
    parent_dir = os.path.dirname(local_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    partial_path = local_path + ".part"
    try:
        # The context manager releases the streamed connection on every path,
        # including the early return below.
        with requests.get(url, stream=True, timeout=timeout) as response:
            if response.status_code != 200:
                print(f"Failed: {url} (HTTP {response.status_code})")
                return

            # A missing or malformed length means "unknown" to tqdm (None),
            # rather than a bogus total of 0 or a ValueError.
            length = response.headers.get("content-length", "")
            total = int(length) if length.isdigit() else None

            with open(partial_path, "wb") as f, tqdm(
                total=total,
                unit="B",
                unit_scale=True,
                unit_divisor=1024,
                desc=os.path.basename(local_path),
            ) as bar:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
                        bar.update(len(chunk))

        # Publish the file only after the whole body has been written.
        os.replace(partial_path, local_path)
    except requests.RequestException as exc:
        # Treat network errors like HTTP errors: report and move on.
        print(f"Failed: {url} ({exc})")
    finally:
        # After a successful os.replace the partial file is already gone.
        try:
            os.remove(partial_path)
        except FileNotFoundError:
            pass

# -----------------------------
# PROCESS EACH <Contents>
# -----------------------------
# Absolute output directory with a trailing separator, used to verify that
# every target path stays inside it.
_output_root = os.path.join(os.path.abspath(OUTPUT_DIR), "")

# NOTE(review): S3 listings requested with encoding-type=url report
# <EncodingType>url</EncodingType> and carry already percent-encoded keys;
# those must not be quoted a second time. Confirm against the listing source.
_keys_pre_encoded = find_text(root, "EncodingType") == "url"

for contents in root.findall(f".//{ns}Contents"):
    key = find_text(contents, "Key")
    if not key:
        continue

    # Skip S3 "directory" placeholder keys
    if key.endswith("/"):
        continue

    # Percent-encode the key so characters such as '#', '?', '+' or '%' are
    # sent as part of the path instead of being reinterpreted.
    url = BASE_URL + (key if _keys_pre_encoded else urllib.parse.quote(key))

    # os.path.join drops OUTPUT_DIR when the key is absolute, so strip leading
    # slashes, then refuse keys that still resolve outside the output
    # directory (e.g. via "..").
    local_path = os.path.join(OUTPUT_DIR, key.lstrip("/"))
    if not os.path.abspath(local_path).startswith(_output_root):
        print(f"Skipping (unsafe key): {key}")
        continue

    # Skip files already downloaded
    if os.path.exists(local_path):
        print(f"Skipping (already exists): {local_path}")
        continue

    print(f"Downloading: {url}")
    download_with_progress(url, local_path)

