# Copyright 2022 The ChromiumOS Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Provides utility for formatting repo manifests.

There does not seem to be any good off-the-shelf XML formatters.

The format we enforce:
* 2 space indents
* All elements start on new lines
* One attribute per line for most elements
* Attributes are indented & aligned to the first one
* Attributes are kept in a consistent order
* Use <foo/> closing syntax when element has no child elements
* Space before the closing />
* No space before the closing >
* Groups of <project> nodes are sorted by checkout path
* Single space between element name and attributes
* Comments have spaces around the <!-- and --> markers
* Comments are indented to the end of the <!-- marker
* Non-whitespace text nodes are deleted, except for <notice> entries

See the files in our manifest repos for living examples.
"""

import collections
import io
import os
import re
from typing import List, Optional, Union
from xml.dom import minidom


def attrsToDict(node):
    """Turn the attributes into a dict for easier management.

    The XML API is not easy to work with.
    """
    ret = collections.OrderedDict()
    for i in range(node.attributes.length):
        attr = node.attributes.item(i)
        ret[attr.name] = attr.value
    return ret


# Force ordering for some elements/attributes.
ATTR_ORDER = {
    "annotation": ("name", "value", "keep"),
    "copyfile": ("src", "dest"),
    "linkfile": ("src", "dest"),
    "project": ("path", "remote", "name", "revision", "dest-branch", "groups"),
    "remote": ("name", "alias", "fetch", "pushurl", "review", "revision"),
    "repo-hooks": ("in-project", "enabled-list"),
}


def orderAttrs(node):
    """Yields |node|'s attributes in the preferred order."""
    attrs = attrsToDict(node)
    for name in ATTR_ORDER.get(node.nodeName, ()):
        if name in attrs:
            yield (name, attrs.pop(name))
    yield from attrs.items()


def has_children(node) -> bool:
    """Determine whether |node| has any (useful) children."""
    if not node.childNodes:
        return False

    # In this case, we expect to handle child text nodes separately.
    if node.nodeName == "notice":
        return True

    # Make sure the children aren't just whitespace text nodes.
    for child in node.childNodes:
        if child.nodeType != child.TEXT_NODE:
            return True

        # Make sure the text node is just whitespace.  Who knows!
        assert child.data.strip() == "", f"Unexpected text node: {child}"

    return False


def _sort_children(nodes: List[minidom.Node]) -> List[minidom.Node]:
    """Sort |nodes| based on attributes.

    Currently this only sorts <project> nodes.
    """

    def project_key(node):
        """Find the sort key for the project |node|."""
        attrs = attrsToDict(node)
        # Splitting out numbers allows for natural sorting of versioned paths.
        parts = re.split(r"([0-9]+)", attrs.get("path", attrs["name"]).lower())
        for i, part in enumerate(parts):
            try:
                parts[i] = int(part)
            except ValueError:
                pass
        return parts

    def flush(*args):
        """Sort the current block of nodes and yield them to the caller."""
        # The nodes might contain more than just projects.  Sorting all the
        # nodes is tricky as we don't know how to compare <project> to
        # whitespace text nodes such that they stay in the right order.
        # Instead, we'll extract all the project nodes and sort them.  If things
        # were sorted, then we yield the block as-is.  If things weren't sorted,
        # we'll weave the existing block together with the sorted block by
        # yielding the next sorted project.  This can lead to slightly
        # undesirable behavior when a comment node is for a specific project,
        # but we leave it to the developer to fix that.  Fixing this in the code
        # is possible, but would require a bit more logic, so we won't bother
        # for now since it should be pretty rare.
        #
        # For example, <comment><proj b><proj a> will be sorted as
        # <comment><proj a><proj b> instead of <proj a><comment><proj b>.
        project_nodes = [x for x in pending_nodes if x.nodeName == "project"]
        sorted_project_nodes = sorted(project_nodes, key=project_key)
        if project_nodes != sorted_project_nodes:
            while pending_nodes:
                node = pending_nodes.pop(0)
                if node.nodeName == "project":
                    yield sorted_project_nodes.pop(0)
                else:
                    yield node
        else:
            yield from pending_nodes
        yield from args
        pending_nodes.clear()

    # Walk the nodes and gather them as groups.  Blank lines and non-project
    # nodes mark the end of a group of nodes.
    pending_nodes = []
    for node in nodes:
        if node.nodeType == node.COMMENT_NODE:
            pending_nodes.append(node)
        elif node.nodeType == node.TEXT_NODE:
            # Allow blank lines to reset sorting.
            if node.data.strip(" ") != "\n":
                yield from flush(node)
            else:
                pending_nodes.append(node)
        elif node.nodeName != "project":
            yield from flush(node)
        else:
            pending_nodes.append(node)

    yield from flush()


def Data(
    data: str,
    # pylint: disable=unused-argument
    path: Optional[Union[str, os.PathLike]] = None,
) -> str:
    """Format |data|.

    Args:
        data: The file content to lint.
        path: The file name for diagnostics/configs/etc...

    Returns:
        Formatted data.
    """
    ONE_LINE_NODES = {"annotation", "copyfile", "linkfile"}

    def _format(
        buffer: io.StringIO, root: minidom.Node, level: int = 0
    ) -> None:
        """Recursively format the nodes starting at |root|."""
        indent = "  " * level

        for node in _sort_children(root.childNodes):
            if node.nodeType == node.COMMENT_NODE:
                # Reformat comment nodes.
                # If the --> was on a line by itself, pull it up.
                comment = node.data.strip()
                lines = [x.strip() for x in comment.splitlines()]
                # If it's a one-liner, leave it as-is.
                if len(lines) == 1:
                    comment = lines[0]
                elif lines:
                    # Indent lines that have content.
                    comment = f"{lines.pop(0)}\n" + "\n".join(
                        f"{indent}     {x}" if x else "" for x in lines
                    )
                buffer.write(f"{indent}<!-- {comment} -->")

                # For some reason the minidom implementation doesn't generate
                # text nodes in the top level, so comments all get compacted
                # into a single line. Fake it by inserting our own text node
                # newline.
                # https://bugs.python.org/issue42341
                if not level:
                    buffer.write("\n")
            elif node.nodeType == node.TEXT_NODE:
                # These are inter-node whitespace (indentation) and newlines.
                # We will handle reformatting of indentation below, but we want
                # to preserve any existing newlines as we largely don't enforce
                # anything with them atm.
                output = node.data.strip(" ")
                # We don't allow multiple blank lines in a row.
                output = re.sub(r"\n\n\n+", "\n\n", output)
                # Finally, strip non-whitespace characters.  There never should
                # be any.
                buffer.write(re.sub(r"[^\s]+", "", output))
            elif node.nodeName == "notice":
                # For a <notice> tag, we want to preserve the inner text as-is,
                # without losing the opening and closing tags as appropriate.
                node_start = f"{indent}<{node.nodeName}>"
                buffer.write(node_start)

                # Since all other text nodes strip non-whitespace characters,
                # handle the text node(s) contained under notice separately.
                assert has_children(node), "provided <notice> element is empty"
                for child in node.childNodes:
                    assert child.nodeType == node.TEXT_NODE, (
                        "<notice> element contains children"
                        f"that aren't text nodes: {repr(child)}"
                    )
                    buffer.write(child.data.strip())

                buffer.write(f"\n{indent}</{node.nodeName}>")
            else:
                # We don't expect any other node type in manifests currently.
                assert node.nodeType == node.ELEMENT_NODE, str(node)

                # Reformat normal nodes, their attributes, and their children.
                node_start = f"{indent}<{node.nodeName}"
                buffer.write(node_start)

                attr_indent = " " * len(node_start)
                if node.attributes:
                    # Indent all the attributes.
                    first = True
                    for name, value in orderAttrs(node):
                        if not first and node.nodeName not in ONE_LINE_NODES:
                            buffer.write("\n" + attr_indent)
                        first = False
                        # No attributes currently need leading/trailing
                        # whitespace.
                        value = value.strip()
                        # No attributes currently use embedded whitespace.  This
                        # might change, but enforce it for now until someone
                        # complains.
                        value = re.sub(r"\s+", "", value)
                        buffer.write(f' {name}="{value}"')

                # Close the tag.
                if not has_children(node):
                    if node.attributes:
                        buffer.write(" ")
                    buffer.write("/>")
                else:
                    buffer.write(">")

                # Yield any children.
                _format(buffer, node, level + 1)

                # Close the element if we haven't yet.
                if has_children(node):
                    buffer.write(f"{indent}</{node.nodeName}>")

    manifest = minidom.parseString(data)
    buffer = io.StringIO()
    buffer.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    _format(buffer, manifest)
    buffer.write("\n")
    return buffer.getvalue()
