from __future__ import annotations

import json
import re

from transformers.utils import is_jmespath_available


if is_jmespath_available():
    import jmespath
else:
    jmespath = None


def _parse_re_match(node_match: re.Match) -> dict | str:
    # If the regex has named groups, return a dict of those groups
    if node_match.groupdict():
        return {key: val for key, val in node_match.groupdict().items() if val is not None}
    # Otherwise the regex must have exactly one unnamed group, and we return that
    else:
        groups = list(node_match.groups())
        if len(groups) > 1:
            raise ValueError(f"Regex has multiple unnamed groups!\nGroups: {groups}\n")
        elif len(groups) == 0:
            raise ValueError(f"Regex has no capture groups:\n\n{node_match.group(0)}")
        return groups[0]


def recursive_parse(
    node_content: str | list | dict,
    node_schema: dict,
):
    """
    This function takes content and a JSON schema which includes
    regex extractors, and recursively parses the content. The output
    should be a data structure matching the schema.

    Args:
        node_content: The content corresponding to this node. Usually a string, but can be something else
                      if the parent node has multiple capture groups or named groups. In that case,
                      we generally pass the capture groups straight through to the children of this node
                      and don't do any parsing at this level.
        node_schema: The schema node controlling the parsing.

    Returns:
        The parsed data structure for the current node.
    """

    # If the schema has a const, we just return that value and do absolutely nothing else
    if "const" in node_schema:
        return node_schema["const"]

    # If the node content is None, we return None. EZ.
    if node_content is None:
        return None

    # If not, we have to do a little parsing. First, set some vars and do basic validation
    node_type = node_schema.get("type")
    has_regex = "x-regex" in node_schema or "x-regex-iterator" in node_schema or "x-regex-key-value" in node_schema
    if has_regex and not isinstance(node_content, str):
        raise TypeError(
            "Schema node got a non-string input, but has a regex for parsing.\n"
            f"Input: {node_content}\n"
            f"Schema: {node_schema}"
        )

    node_regex = node_schema.get("x-regex")
    node_regex_iterator = node_schema.get("x-regex-iterator")
    node_regex_to_dict = node_schema.get("x-regex-key-value")
    if node_regex is not None:
        node_match = re.search(node_regex, node_content, flags=re.DOTALL)
        if not node_match:
            return None
        node_content = _parse_re_match(node_match)
    if node_regex_iterator is not None:
        if node_type != "array":
            raise TypeError(f"Schema node with type {node_type} cannot use x-regex-iterator.\nSchema: {node_schema}")
        # Note that this can be applied after a standard node-regex search
        node_content = [
            _parse_re_match(node_match)
            for node_match in re.finditer(node_regex_iterator, node_content, flags=re.DOTALL)
        ]
        if not node_content:
            return None
    if node_regex_to_dict is not None:
        if node_type != "object":
            raise TypeError(f"Schema node with type {node_type} cannot use x-regex-key-value.\nSchema: {node_schema}")
        # Note that this can be applied after a standard node-regex search
        output_content = {}
        for node_match in re.finditer(node_regex_to_dict, node_content, flags=re.DOTALL):
            match_groups = _parse_re_match(node_match)
            if not isinstance(match_groups, dict) or "key" not in match_groups or "value" not in match_groups:
                raise ValueError(
                    f"Regex for x-regex-key-value must have named groups 'key' and 'value'.\n"
                    f"Match groups: {match_groups}\n"
                    f"Schema: {node_schema}"
                )
            output_content[match_groups["key"]] = match_groups["value"]
        node_content = output_content
        if not node_content:
            return None

    # Next, if the node has a parser, apply it. We do this after regexes so that the regex can extract
    # a substring to parse, if needed.
    if "x-parser" in node_schema:
        parser = node_schema["x-parser"]
        if parser == "json":
            if not isinstance(node_content, str):
                raise TypeError(
                    f"Node has JSON parser but got non-string input: {node_content}\nSchema: {node_schema}"
                )
            parser_args = node_schema.get("x-parser-args", {})
            transform = parser_args.get("transform")
            allow_non_json = parser_args.get("allow_non_json", False)
            try:
                parsed_json = json.loads(node_content)
            except json.JSONDecodeError as e:
                if allow_non_json:
                    parsed_json = node_content
                else:
                    raise ValueError(
                        f"Node has JSON parser but could not parse its contents as JSON. You can use the `allow_non_json` parser arg for nodes which may contain JSON or string content.\n\nContent: {node_content}\n\nError: {e}"
                    )
            if transform is not None:
                if jmespath is None:
                    raise ImportError(
                        "Chat response schema includes a jmespath transformation, but jmespath is not installed. You can install it with `pip install jmespath`."
                    )
                parsed_json = jmespath.search(parser_args["transform"], parsed_json)
            node_content = parsed_json
        else:
            raise ValueError(f"Unknown parser {parser} for schema node: {node_schema}")

    # If there's a mapping, apply it now
    if "x-mapping" in node_schema:
        if not isinstance(node_content, str):
            raise TypeError(
                f"Schema node with type {node_type} cannot use x-mapping on non-string content.\n"
                f"Content: {node_content}\n"
                f"Schema: {node_schema}"
            )
        mapping = node_schema["x-mapping"]
        if node_content in mapping:
            node_content = mapping[node_content]

    if "x-mapping-regex" in node_schema:
        if not isinstance(node_content, str):
            raise TypeError(
                f"Schema node with type {node_type} cannot use x-mapping-regex on non-string content.\n"
                f"Content: {node_content}\n"
                f"Schema: {node_schema}"
            )
        mapping_regex = node_schema["x-mapping-regex"]
        for pattern, replacement in mapping_regex.items():
            node_content = re.sub(pattern, replacement, node_content, flags=re.DOTALL)

    # Finally, handle parsed content based on schema type and recurse if required
    if node_type == "object":
        parsed_schema = {}
        if isinstance(node_content, str):
            # This means we don't have a regex at this level, so all of our child nodes need to parse the whole
            # string themselves to extract their value.
            if "properties" not in node_schema:
                raise ValueError(
                    f"Object node received string content but has no regex or parser to handle it.\n"
                    f"Content: {node_content}\n"
                    f"Schema: {node_schema}"
                )
            for key, child_node in node_schema["properties"].items():
                child_node_content = recursive_parse(node_content, node_schema["properties"][key])
                if child_node_content is not None:
                    parsed_schema[key] = child_node_content
            return parsed_schema
        elif isinstance(node_content, dict):
            for key, child_node in node_schema.get("properties", {}).items():
                if "const" in child_node:
                    parsed_schema[key] = child_node["const"]
                elif key in node_content:
                    parsed_schema[key] = recursive_parse(node_content[key], child_node)
                elif "default" in child_node:
                    parsed_schema[key] = child_node["default"]
            if "additionalProperties" in node_schema:
                for key, value in node_content.items():
                    if key not in node_schema.get("properties", {}):
                        parsed_schema[key] = recursive_parse(value, node_schema["additionalProperties"])
            return parsed_schema
        else:
            raise TypeError(f"Expected a dict or str for schema node with type object, got {node_content}")
    elif node_type == "array":
        if not node_content:
            return []
        parsed_schema = []
        if "items" in node_schema:
            if not isinstance(node_content, list):
                raise TypeError(f"Expected a list or regex for schema node with type array, got {node_content}")
            for item in node_content:
                parsed_schema.append(recursive_parse(item, node_schema["items"]))
            return parsed_schema
        elif "prefixItems" in node_schema:
            if not isinstance(node_content, list):
                if len(node_schema["prefixItems"]) == 1:
                    # If there's only one prefix item, this is a single item array, we can just wrap the string
                    node_content = [node_content]
                else:
                    raise TypeError(f"Expected a list or regex for schema node with type array, got {node_content}")
            if len(node_content) != len(node_schema["prefixItems"]):
                raise ValueError(
                    f"Array node has {len(node_content)} items, but schema only has "
                    f"{len(node_schema['prefixItems'])} prefixItems defined.\n"
                    f"Content: {node_content}\n"
                    f"Schema: {node_schema}"
                )
            for item, item_schema in zip(node_content, node_schema["prefixItems"]):
                parsed_schema.append(recursive_parse(item, item_schema))
            return parsed_schema
        else:
            raise ValueError(f"Array node has no items or prefixItems schema defined.\nSchema: {node_schema}")
    elif node_type in ("string", "integer", "number", "boolean"):
        if not isinstance(node_content, str):
            raise TypeError(f"Expected a string for schema node with type {node_type}, got {node_content}")
        if node_type == "integer":
            return int(node_content)
        elif node_type == "number":
            return float(node_content)
        elif node_type == "boolean":
            if node_content.lower() in ("true", "1"):
                return True
            elif node_content.lower() in ("false", "0"):
                return False
            else:
                raise ValueError(f"Invalid boolean value: {node_content}")
        else:
            # String type
            return node_content
    elif node_type is None:
        return node_content  # Don't touch it
    else:
        raise TypeError(f"Unsupported schema type {node_type} for node: {node_content}")