SkinbaseNova/app/Services/ContentSanitizer.php

<?php

namespace App\Services;

use App\Services\LegacySmileyMapper;
use League\CommonMark\Environment\Environment;
use League\CommonMark\Extension\Autolink\AutolinkExtension;
use League\CommonMark\Extension\CommonMark\CommonMarkCoreExtension;
use League\CommonMark\Extension\Strikethrough\StrikethroughExtension;
use League\CommonMark\MarkdownConverter;

/**
 * Sanitizes and renders user-submitted content.
 *
 * Pipeline (see render()):
 *  1. Convert legacy <br> / <b> / <i> / <p> hints from old content to
 *     Markdown and strip any other raw HTML tags (we don't allow HTML)
 *  2. Parse a subset of Markdown (bold, italic, code, links, line breaks)
 *  3. Sanitize the rendered HTML: whitelist-only tags and attributes
 *  4. Return safe HTML ready for storage or display
 */
class ContentSanitizer
{
    /** Maximum number of emoji allowed before triggering a flood error. */
    public const EMOJI_COUNT_MAX = 50;

    /**
     * Maximum ratio of emoji-to-total-characters before content is considered
     * an emoji flood (applies only when emoji count > 5 to avoid false positives
     * on very short strings like a single reaction comment).
     */
    public const EMOJI_DENSITY_MAX = 0.40;

    /** Maximum accepted content length, in characters. */
    private const MAX_LENGTH = 10_000;

    /** The density guard only applies above this many emoji. */
    private const EMOJI_DENSITY_MIN_COUNT = 5;

    /** mbstring conversion map covering every non-ASCII code point. */
    private const NON_ASCII_MAP = [0x80, 0x10FFFF, 0, 0xFFFFFF];

    // HTML tags we allow in the final rendered output
    private const ALLOWED_TAGS = [
        'p', 'br', 'strong', 'em', 'code', 'pre',
        'a', 'ul', 'ol', 'li', 'blockquote', 'del',
    ];

    // Allowed attributes per tag
    private const ALLOWED_ATTRS = [
        'a' => ['href', 'title', 'rel', 'target'],
    ];

    private static ?MarkdownConverter $converter = null;

    // ─────────────────────────────────────────────────────────────────────────
    // Public API
    // ─────────────────────────────────────────────────────────────────────────

    /**
     * Convert raw user input (legacy or new) to sanitized HTML.
     *
     * @param string|null $raw  Raw input; null or whitespace-only yields ''.
     * @return string  Safe HTML
     */
    public static function render(?string $raw): string
    {
        if ($raw === null || trim($raw) === '') {
            return '';
        }

        // 1. Convert legacy HTML fragments to Markdown-friendly text
        $text = self::legacyHtmlToMarkdown($raw);

        // 2. Parse Markdown → HTML
        $html = self::parseMarkdown($text);

        // 3. Sanitize HTML (strip disallowed tags / attrs)
        return self::sanitizeHtml($html);
    }

    /**
     * Normalize previously rendered HTML for display-time policy changes.
     * This is useful when stored HTML predates current link attributes or
     * when display rules depend on the author rather than the raw content.
     *
     * @param string|null $html        Previously rendered HTML.
     * @param bool        $allowLinks  When false, every <a> is replaced by its contents.
     * @return string  Safe HTML ('' for null / blank input)
     */
    public static function sanitizeRenderedHtml(?string $html, bool $allowLinks = true): string
    {
        if ($html === null || trim($html) === '') {
            return '';
        }

        return self::sanitizeHtml($html, $allowLinks);
    }

    /**
     * Strip ALL HTML from input, returning plain text with newlines preserved.
     *
     * The result is plain text, not HTML: entities are decoded, so it must be
     * escaped again before being placed into an HTML context.
     */
    public static function stripToPlain(?string $html): string
    {
        if ($html === null) {
            return '';
        }

        // Convert <br> and </p> to line breaks before stripping
        $text = preg_replace(['/<br\s*\/?>/i', '/<\/p>/i'], "\n", $html);
        $text = strip_tags($text ?? '');
        $text = html_entity_decode($text, ENT_QUOTES | ENT_HTML5, 'UTF-8');

        return trim($text);
    }

    /**
     * Validate that a Markdown-lite string does not contain disallowed patterns.
     * Returns an array of validation errors (empty = OK).
     *
     * @return list<string>
     */
    public static function validate(string $raw): array
    {
        $errors     = [];
        $totalChars = mb_strlen($raw);

        if ($totalChars > self::MAX_LENGTH) {
            $errors[] = 'Content exceeds maximum length of 10,000 characters.';
        }

        // Detect raw HTML tags (we forbid them)
        if (preg_match('/<[a-z][^>]*>/i', $raw)) {
            $errors[] = 'HTML tags are not allowed. Use Markdown formatting instead.';
        }

        // Count emoji to prevent absolute spam
        $emojiCount = self::countEmoji($raw);
        if ($emojiCount > self::EMOJI_COUNT_MAX) {
            $errors[] = 'Too many emoji. Please limit emoji usage.';
        }

        // Reject emoji-flood content: density guard catches e.g. 15 emoji in a
        // 20-char string even when the absolute count is below EMOJI_COUNT_MAX.
        if (
            $emojiCount > self::EMOJI_DENSITY_MIN_COUNT
            && $totalChars > 0
            && ($emojiCount / $totalChars) > self::EMOJI_DENSITY_MAX
        ) {
            $errors[] = 'Content is mostly emoji. Please add some text.';
        }

        return $errors;
    }

    /**
     * Collapse consecutive runs of the same emoji in $text.
     *
     * Delegates to LegacySmileyMapper::collapseFlood() so the behaviour is
     * consistent between new submissions and migrated legacy content.
     *
     * Example: "🍺 🍺 🍺 🍺 🍺 🍺 🍺" (7×) → "🍺 🍺 🍺 🍺 🍺 ×7"
     *
     * @param  int $maxRun  Keep at most this many consecutive identical emoji.
     */
    public static function collapseFlood(string $text, int $maxRun = 5): string
    {
        return LegacySmileyMapper::collapseFlood($text, $maxRun);
    }

    // ─────────────────────────────────────────────────────────────────────────
    // Private helpers
    // ─────────────────────────────────────────────────────────────────────────

    /**
     * Convert legacy HTML-style formatting to Markdown equivalents.
     * This runs BEFORE Markdown parsing to handle old content gracefully.
     */
    private static function legacyHtmlToMarkdown(string $html): string
    {
        $replacements = [
            // Bold
            '/<b>(.*?)<\/b>/is'           => '**$1**',
            '/<strong>(.*?)<\/strong>/is' => '**$1**',
            // Italic
            '/<i>(.*?)<\/i>/is'           => '*$1*',
            '/<em>(.*?)<\/em>/is'         => '*$1*',
            // Line breaks → actual newlines
            '/<br\s*\/?>/i'               => "\n",
            // Paragraphs
            '/<p>(.*?)<\/p>/is'           => "$1\n\n",
            // Strip remaining tags, comments, doctypes and processing
            // instructions. Only tag-like constructs are removed, so plain text
            // such as "I <3 you, a > b" is not eaten from '<' to '>'.
            '/<(?:\/?[a-z]|[!?])[^>]*>/i' => '',
        ];

        $result = $html;
        foreach ($replacements as $pattern => $replacement) {
            // preg_replace() returns null on a PCRE error; keep the previous value then.
            $result = preg_replace($pattern, $replacement, $result) ?? $result;
        }

        // Decode HTML entities (e.g. &amp; → &)
        return html_entity_decode($result, ENT_QUOTES | ENT_HTML5, 'UTF-8');
    }

    /**
     * Parse Markdown-lite subset to HTML.
     */
    private static function parseMarkdown(string $text): string
    {
        return (string) self::getConverter()->convert($text)->getContent();
    }

    /**
     * Whitelist-based HTML sanitizer.
     * Removes all tags not in ALLOWED_TAGS, and strips disallowed attributes.
     *
     * @param bool $allowLinks  When false, <a> elements are replaced by their contents.
     */
    private static function sanitizeHtml(string $html, bool $allowLinks = true): string
    {
        // Feed DOMDocument pure ASCII so its encoding detection cannot mangle UTF-8.
        $encodedHtml = mb_encode_numericentity($html, self::NON_ASCII_MAP, 'UTF-8');

        $doc = new \DOMDocument('1.0', 'UTF-8');

        // Suppress warnings from malformed fragments, then restore the caller's
        // libxml error mode so this global setting does not leak.
        $previousErrorMode = libxml_use_internal_errors(true);
        try {
            $doc->loadHTML(
                '<?xml encoding="UTF-8"><html><body>' . $encodedHtml . '</body></html>',
                LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD
            );
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previousErrorMode);
        }

        $body = $doc->getElementsByTagName('body')->item(0);
        if (! $body instanceof \DOMElement) {
            // Nothing parseable: fail closed rather than echo the input back.
            return '';
        }

        self::cleanNode($body, $allowLinks);

        // Serialize back, removing the wrapping html/body
        $inner = '';
        foreach ($body->childNodes as $child) {
            $inner .= $doc->saveHTML($child);
        }

        // Restore non-ASCII characters only. Markup-significant entities
        // (&lt; &gt; &amp; &quot; …) MUST stay encoded: decoding them here would
        // turn escaped text back into live tags / attribute delimiters.
        return trim(self::decodeNonAsciiEntities($inner));
    }

    /**
     * Decode character references whose expansion is entirely non-ASCII
     * (e.g. &#233;, &eacute;, &nbsp;), leaving every reference that expands to
     * an ASCII character (e.g. &lt;, &#60;, &quot;) untouched.
     */
    private static function decodeNonAsciiEntities(string $html): string
    {
        $decoded = preg_replace_callback(
            '/&(?:#[0-9]+|#[xX][0-9a-fA-F]+|[A-Za-z][A-Za-z0-9]*);/',
            static function (array $match): string {
                $expansion = html_entity_decode($match[0], ENT_QUOTES | ENT_HTML5, 'UTF-8');

                // Unknown references come back unchanged (ASCII) and are kept as-is too.
                return preg_match('/[\x00-\x7F]/', $expansion) === 1 ? $match[0] : $expansion;
            },
            $html
        );

        // On a PCRE failure keep the fully-escaped markup, which is still safe.
        return $decoded ?? $html;
    }

    /**
     * Recursively clean a DOMNode — strip forbidden tags/attributes.
     *
     * Descendants are always cleaned BEFORE their parent is kept or unwrapped,
     * so content hoisted out of a removed element has already been sanitized.
     * Only element and text nodes survive: CDATA sections (which libxml appears
     * to use for <script>/<style> bodies) become ordinary text nodes so they
     * are escaped on output, and comments / processing instructions are dropped.
     */
    private static function cleanNode(\DOMNode $node, bool $allowLinks = true): void
    {
        $document = $node instanceof \DOMDocument ? $node : $node->ownerDocument;

        // Iterate over a snapshot: the live child list shifts while we mutate it.
        foreach (iterator_to_array($node->childNodes, false) as $child) {
            // DOMCdataSection extends DOMText, so it must be tested first.
            if ($child instanceof \DOMCdataSection) {
                if ($document !== null) {
                    $node->replaceChild($document->createTextNode((string) $child->nodeValue), $child);
                } else {
                    $node->removeChild($child);
                }
                continue;
            }

            if ($child instanceof \DOMText) {
                continue;
            }

            if (! $child instanceof \DOMElement) {
                $node->removeChild($child);
                continue;
            }

            // Depth-first: sanitize the subtree before deciding this element's fate.
            self::cleanNode($child, $allowLinks);

            $tag = strtolower($child->nodeName);

            if (! in_array($tag, self::ALLOWED_TAGS, true)) {
                self::unwrapElement($child);
                continue;
            }

            // Strip disallowed attributes (collect first; the map is live)
            $allowedAttrs  = self::ALLOWED_ATTRS[$tag] ?? [];
            $attrsToRemove = [];
            foreach ($child->attributes as $attr) {
                if (! in_array($attr->nodeName, $allowedAttrs, true)) {
                    $attrsToRemove[] = $attr->nodeName;
                }
            }
            foreach ($attrsToRemove as $attrName) {
                $child->removeAttribute($attrName);
            }

            if ($tag !== 'a') {
                continue;
            }

            if (! $allowLinks) {
                self::unwrapElement($child);
                continue;
            }

            $href = $child->getAttribute('href');
            if ($href !== '' && ! self::isSafeUrl($href)) {
                self::unwrapElement($child);
                continue;
            }

            // Force links to be safe
            $child->setAttribute('rel', 'noopener noreferrer nofollow');
            $child->setAttribute('target', '_blank');
        }
    }

    /**
     * Replace an element with its own children, preserving their order.
     */
    private static function unwrapElement(\DOMElement $element): void
    {
        $parent = $element->parentNode;
        if ($parent === null) {
            return;
        }

        while ($element->firstChild !== null) {
            $parent->insertBefore($element->firstChild, $element);
        }
        $parent->removeChild($element);
    }

    /**
     * Very conservative URL whitelist: site-relative paths, anchors and
     * http(s) URLs only. Surrounding whitespace is ignored.
     */
    private static function isSafeUrl(string $url): bool
    {
        $candidate = strtolower(trim($url));

        // Allow relative paths and anchors
        if (str_starts_with($candidate, '/') || str_starts_with($candidate, '#')) {
            return true;
        }

        // Only allow http(s)
        return str_starts_with($candidate, 'http://') || str_starts_with($candidate, 'https://');
    }

    /**
     * Count Unicode emoji in a string (basic heuristic).
     *
     * Variation selectors and the rest of U+FE00–U+FEFF are deliberately not
     * counted: they are modifiers / Arabic presentation forms, not emoji.
     * Returns 0 when the string is not valid UTF-8.
     */
    private static function countEmoji(string $text): int
    {
        // Match common emoji ranges
        $count = preg_match_all('/[\x{1F300}-\x{1FAFF}\x{2600}-\x{27BF}]/u', $text);

        // preg_match_all() returns false on error (e.g. malformed UTF-8).
        return $count === false ? 0 : $count;
    }

    /**
     * Lazy-load and cache the Markdown converter.
     */
    private static function getConverter(): MarkdownConverter
    {
        if (self::$converter === null) {
            $env = new Environment([
                'html_input'         => 'strip',
                'allow_unsafe_links' => false,
                'max_nesting_level'  => 10,
            ]);
            $env->addExtension(new CommonMarkCoreExtension());
            $env->addExtension(new AutolinkExtension());
            $env->addExtension(new StrikethroughExtension());

            self::$converter = new MarkdownConverter($env);
        }

        return self::$converter;
    }
}