"""Convert DeepLabCut project to benchmark dataset
==================================================

Create a ``poseinterface`` benchmark dataset from a DeepLabCut (DLC) project.
"""

# %%
# Imports
# -------
import json
import shutil
import tempfile
from datetime import datetime, timezone
from pathlib import Path

import poseinterface
from poseinterface.clips import extract_clip
from poseinterface.io import (
    annotations_to_poseinterface,
    frames_to_poseinterface,
    predictions_to_poseinterface,
    video_to_poseinterface,
)
from poseinterface.utils import tree

# %%
# Overview
# --------
# We'll handle the conversion in two steps:
#
# 1. **Convert:** DLC project files (videos, frame annotations, and
#    keypoint predictions) are restructured into the
#    :ref:`poseinterface benchmark layout <target-benchmark-dataset>`.
# 2. **Extract clips:** Short video clips and their labels are extracted
#    from the converted videos and their corresponding keypoint
#    predictions, ready for expert review.
#
# .. figure:: /_static/DLC_to_poseinterface_worklow.svg
#    :alt: Workflow diagram showing how a DLC project is converted
#           to a poseinterface benchmark dataset
#    :align: center
#
#    High-level overview of the two-step conversion workflow.

# %%
# Source DLC project
# ------------------
# We work with a dataset from the
# `Sainsbury Wellcome Centre (SWC) <https://www.sainsburywellcome.org/>`_,
# produced by Loukia Katsouri from John O'Keefe's lab.
# It contains single-animal top-down videos of mice exploring an
# Elevated Plus Maze (EPM), analysed using
# `DeepLabCut (DLC) <https://www.mackenziemathislab.org/deeplabcut>`_.
#
# .. note::
#
#    This example runs against a lightweight fixture shipped with the
#    repository (under ``tests/data/``). This fixture contains only a subset
#    of the original DLC project, and is intended for testing and demonstration
#    purposes.
#
#    Replace ``source_project_dir`` and ``benchmark_base_dir`` with the paths
#    to your DLC project and benchmark dataset directories, respectively. Keep
#    in mind that your project will contain more files than are shown here.


# The fixture is looked up relative to the parent of the current working
# directory, under ``tests/data/dlc``.
parent_of_cwd = Path(".").resolve().parent
source_project_dir = parent_of_cwd.joinpath(
    "tests", "data", "dlc", "MouseTopDown-Loukia-2022-09-13"
)
source_project_listing = tree(source_project_dir, level=1, exclude_hidden=True)
print(source_project_listing)

# The benchmark dataset is written to a fresh temporary directory, which is
# removed again at the end of this example.
temporary_dir = tempfile.mkdtemp(prefix="poseinterface-benchmark-")
benchmark_base_dir = Path(temporary_dir)
print(f"\nBenchmark dataset will be saved to: {benchmark_base_dir}")

# %%
# The two source project sub-directories we care about are:
#
# - ``videos/``: the session videos and their corresponding prediction files.
# - ``labeled-data/``: sampled frames and their keypoint annotations.
#
# Let's peek inside each.

# List the contents of the project's ``videos/`` folder (one level deep).
videos_dir = source_project_dir / "videos"
print(tree(videos_dir, level=1, exclude_hidden=True))

# %%
# In ``videos/``, each video (ending in ``converted.mp4``) has a companion .csv
# prediction file.

# List the contents of the project's ``labeled-data/`` folder (two levels
# deep, so the files inside each per-video sub-directory are shown too).
labeled_data_dir = source_project_dir / "labeled-data"
print(tree(labeled_data_dir, level=2, exclude_hidden=True))

# %%
# In ``labeled-data/``, the sub-directories mirror the video names (without
# .mp4) and contain the sampled frame images (.png) and their annotations
# (.csv). In real projects you may also find predictions and annotations in
# .h5 format, as well as filtered prediction files.

# %%
# Define sessions to convert
# ---------------------------
# We select two sessions from the DLC project and assign each to either
# the ``Train`` or ``Test`` split of the
# :ref:`benchmark dataset <target-benchmark-dataset>`.
# You may expand this list with more sessions, but ensure that each session
# belongs to exactly one split, and that the same subject doesn't appear in
# both splits (to avoid data leakage).
# All videos use the same top-down camera view (``cam-topdown``).

# Name of the dataset; it becomes the folder level between the split and the
# individual sessions in the benchmark layout.
project_name = "SWC-plusmaze"

# One entry per session to convert:
#
# - ``split``: benchmark split the session is written to;
# - ``source_video``: file name of the video inside the DLC ``videos/`` folder;
# - ``sub_id``, ``ses_id``, ``cam_id``: identifiers used to build the
#   ``sub-<sub_id>_ses-<ses_id>_cam-<cam_id>`` names of the converted files.
sessions = [
    # Training session
    {
        "split": "Train",
        "source_video": "M727755_EPM_20200317_170544999-converted.mp4",
        "sub_id": "M727755",
        "ses_id": "20200317",
        "cam_id": "topdown",
    },
    # Test session (different subject, to avoid data leakage)
    {
        "split": "Test",
        "source_video": "M708154_EPM_20200317_185651629-converted.mp4",
        "sub_id": "M708154",
        "ses_id": "20200317",
        "cam_id": "topdown",
    },
]

# %%
# Convert to benchmark format
# ----------------------------
# For each session we:
#
# 1. copy (and re-encode, if necessary) the session video;
# 2. convert DLC keypoint annotations to COCO JSON, as well as copy and
#    rename the corresponding frame images;
# 3. convert DLC keypoint predictions to COCO JSON.

for session in sessions:
    split = session["split"]
    # Identifiers forwarded as keyword arguments to every conversion call.
    ids = {k: session[k] for k in ["sub_id", "ses_id", "cam_id"]}
    sub_ses_prefix = f"sub-{ids['sub_id']}_ses-{ids['ses_id']}"
    sub_ses_cam_prefix = f"{sub_ses_prefix}_cam-{ids['cam_id']}"
    source_videos_dir = source_project_dir / "videos"
    source_video_path = source_videos_dir / session["source_video"]
    # The labeled-data sub-directory is named after the video (without .mp4).
    source_frames_dir = (
        source_project_dir / "labeled-data" / source_video_path.stem
    )
    target_session_dir = (
        benchmark_base_dir / split / project_name / sub_ses_prefix
    )
    target_frames_dir = target_session_dir / "Frames"
    # ``parents=True`` also creates the session directory itself, which the
    # video and video labels are written to below.
    target_frames_dir.mkdir(parents=True, exist_ok=True)

    print(f"Converting session: {split}/{project_name}/{sub_ses_prefix}")
    # Copy the session video, re-encoding to H.264/yuv420p if necessary
    video_to_poseinterface(
        input_video=source_video_path,
        output_video_dir=target_session_dir,
        **ids,
    )
    print(f"\tvideo: {source_video_path.name} -> {sub_ses_cam_prefix}.mp4")

    # Convert DLC annotations to COCO frame labels JSON, then copy the
    # corresponding frame images with standardised poseinterface filenames.
    # In real projects there may be multiple annotation CSVs (e.g. for
    # different labelers); adjust the glob pattern to select the right one.
    # ``glob`` yields matches in no guaranteed order, so when several files
    # match we take the one that sorts first, which keeps the choice
    # reproducible across machines and filesystems.
    source_annotations_path = min(
        source_frames_dir.glob("CollectedData_*.csv"),
        default=None,
    )
    if source_annotations_path is None:
        print(
            f"\tNo CollectedData CSV found in {source_frames_dir}."
            " Skipping annotations-to-poseinterface conversion."
        )
    else:
        framelabels_path = annotations_to_poseinterface(
            input_path=source_annotations_path,
            output_dir=target_frames_dir,
            format="frame",
            **ids,
        )
        frames_to_poseinterface(
            input_dir=source_frames_dir,
            output_dir=target_frames_dir,
            framelabels_path=framelabels_path,
        )
        print(
            f"\tannotations (+ frame images): {source_annotations_path.name} "
            f"-> {framelabels_path.name}"
        )

    # Convert DLC predictions to COCO video labels JSON for clip extraction.
    # In real projects there may be multiple prediction CSVs (e.g. filtered
    # versions); adjust the glob pattern to select the right one.
    # As above, the match that sorts first is used so that the selection
    # does not depend on directory listing order.
    source_predictions_path = min(
        source_videos_dir.glob(f"{source_video_path.stem}*.csv"),
        default=None,
    )
    if source_predictions_path is None:
        print(
            f"\tNo prediction CSV found for {source_video_path.stem!r} in "
            f"{source_project_dir / 'videos'}. Skipping predictions-to-"
            "poseinterface conversion."
        )
    else:
        predictions_to_poseinterface(
            input_path=source_predictions_path,
            video_path=source_video_path,
            output_dir=target_session_dir,
            **ids,
        )
        print(
            f"\tpredictions: {source_predictions_path.name} -> "
            f"{sub_ses_cam_prefix}_videolabels.json"
        )
    print("Done.\n")

# %%
# The resulting benchmark dataset:

# Show the converted dataset, five directory levels deep.
converted_dataset_tree = tree(benchmark_base_dir, level=5)
print(converted_dataset_tree)

# %%
# .. note::
#
#    Frame labels (``framelabels.json``) are generated for both splits, but in
#    the **published** dataset the ``Test`` split intentionally omits them for
#    evaluation. See the
#    :ref:`folder structure specification<target-dataset-folder-structure>` for
#    details.
#
#    The ``videolabels.json`` files generated alongside each session video are
#    intermediate artifacts used for clip extraction in the next section, and
#    will not be included in the published dataset.


# %%
# Extract clips
# -------------
# Clips (short video segments) can be extracted from the converted session
# videos. When the ``videolabels.json`` files are present, the corresponding
# clip label files (``cliplabels.json``) are generated automatically during
# clip extraction.
# These clip label files should then be proof-read and corrected by
# experts before being included in the benchmark dataset.
#
# First, we specify the clip-extraction parameters. This step can be repeated
# with different parameters to incrementally expand the clip set.

# Frame in the session video at which each clip starts.
start_frames = [25, 50, 75]
# Length shared by all clips, counted in frames (not seconds).
duration = 5
print(f"Extracting {duration}-frame clips starting at frames: {start_frames}")

# %%
# We loop over all sessions and extract clips at each start frame.
# The resulting video clips and their ``cliplabels.json`` files are saved
# in a ``Clips/`` subdirectory within each session folder.

for session in sessions:
    # Rebuild the names that the conversion step gave to this session's
    # folder and video file.
    session_name = f"sub-{session['sub_id']}_ses-{session['ses_id']}"
    video_name = f"{session_name}_cam-{session['cam_id']}.mp4"
    split_dir = benchmark_base_dir / session["split"]
    # The video path is the same for every clip of a session, so it is
    # built once per session rather than once per start frame.
    session_video_path = split_dir / project_name / session_name / video_name

    for start_frame in start_frames:
        # ``extract_clip`` returns a pair; only its first item (the path of
        # the extracted clip) is needed here.
        clip_path, _unused = extract_clip(
            video_path=session_video_path,
            start_frame=start_frame,
            duration=duration,
        )
        print(f"Extracted clip: {clip_path.stem}")


# %%
# The resulting benchmark dataset, including the extracted clips and their
# corresponding labels:

# Show the dataset again, now that clips have been extracted.
final_dataset_tree = tree(benchmark_base_dir, level=5)
print(final_dataset_tree)


# %%
# .. note::
#
#    In the published dataset, the ``Train`` split includes all
#    ``cliplabels.json`` files. The ``Test`` split omits all
#    ``cliplabels.json`` files and instead provides only clip start labels
#    (``startlabels.json``), derived from each clip's first frame,
#    to support point-tracker evaluation.
#    The ``videolabels.json`` files generated in the previous section are
#    intermediate artifacts used for clip extraction, and are never shared.
#    See the :ref:`folder structure specification<target-dataset-folder-\
#    structure>` for details.


# %%
# Record provenance (optional)
# ----------------------------
# This step is optional and can be safely skipped, but it is highly recommended
# when converting real data, for book-keeping and reproducibility purposes.
#
# We save a copy of this script (when its path is available) alongside a JSON
# sidecar with the ``poseinterface`` version (including git commit, via
# ``setuptools_scm``), a UTC timestamp, and the path to the source DLC
# project. Both files are written to a top-level ``.provenance/`` folder,
# named by project, so multiple projects under the same
# ``benchmark_base_dir`` stay distinct.

# sphinx_gallery_capture_repr = ()
provenance_dir = benchmark_base_dir / ".provenance"
provenance_dir.mkdir(parents=True, exist_ok=True)

# ``__file__`` is set when running this script directly with Python, but not
# when sphinx-gallery executes it during the docs build, so the script copy
# is only made when a (non-empty) path is available.
this_script = globals().get("__file__")
if this_script:
    script_copy_path = provenance_dir / f"{project_name}.py"
    shutil.copy(Path(this_script), script_copy_path)

# JSON sidecar: package version, UTC conversion time and source project path.
provenance_record = {
    "poseinterface_version": poseinterface.__version__,
    "converted_at": datetime.now(timezone.utc).isoformat(),
    "source_project_dir": str(source_project_dir),
}
sidecar_path = provenance_dir / f"{project_name}.json"
sidecar_path.write_text(json.dumps(provenance_record, indent=2))

# %%
# Clean up
# --------
# Since this example writes to a temporary directory, we remove it at the end.
#
# .. warning::
#
#    Only run this cell when ``benchmark_base_dir`` points to a temporary
#    location. The guard below refuses to delete anything outside the system
#    temp directory, so it is safe to leave in place when you adapt this
#    example to a real benchmark dataset path.

system_tempdir = Path(tempfile.gettempdir()).resolve()
target = benchmark_base_dir.resolve()

# Deletion is only allowed for a path strictly below the system temp
# directory: never the temp directory itself, and never anything outside it.
outside_tempdir = not target.is_relative_to(system_tempdir)
if outside_tempdir or target == system_tempdir:
    print(
        f"Refusing to remove {target}: not inside system temp dir "
        f"({system_tempdir}). Delete manually if you really want to."
    )
else:
    shutil.rmtree(target)
    print(f"Removed temporary benchmark directory: {target}")