Files
SkinbaseNova/app/Services/ContentSanitizer.php
2026-03-28 19:15:39 +01:00

347 lines
12 KiB
PHP
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
<?php
namespace App\Services;
use App\Services\LegacySmileyMapper;
use League\CommonMark\Environment\Environment;
use League\CommonMark\Extension\Autolink\AutolinkExtension;
use League\CommonMark\Extension\CommonMark\CommonMarkCoreExtension;
use League\CommonMark\Extension\Strikethrough\StrikethroughExtension;
use League\CommonMark\MarkdownConverter;
/**
 * Sanitizes and renders user-submitted content.
 *
 * Pipeline used by render():
 *  1. Convert legacy HTML hints (<b>, <i>, <br>, <p>, ...) to Markdown, strip
 *     every remaining tag and decode HTML entities.
 *  2. Parse the text as Markdown (CommonMark core + autolink + strikethrough,
 *     with raw HTML input stripped by the converter).
 *  3. Sanitize the rendered HTML: whitelist-only tags and attributes, safe
 *     link targets only.
 *  4. Return HTML that is safe for storage or display.
 */
class ContentSanitizer
{
    /** Maximum number of emoji allowed before triggering a flood error. */
    public const EMOJI_COUNT_MAX = 50;

    /**
     * Maximum ratio of emoji-to-total-characters before content is considered
     * an emoji flood (applies only when emoji count > 5 to avoid false positives
     * on very short strings like a single reaction comment).
     */
    public const EMOJI_DENSITY_MAX = 0.40;

    /** HTML tags allowed in the final rendered output. */
    private const ALLOWED_TAGS = [
        'p', 'br', 'strong', 'em', 'code', 'pre',
        'a', 'ul', 'ol', 'li', 'blockquote', 'del',
    ];

    /** Allowed attributes per tag; tags not listed keep no attributes. */
    private const ALLOWED_ATTRS = [
        'a' => ['href', 'title', 'rel', 'target'],
    ];

    /**
     * Elements removed together with their content instead of being unwrapped:
     * their content is code or embedded markup, not user-visible text.
     */
    private const DROPPED_TAGS = [
        'script', 'style', 'iframe', 'object', 'embed', 'noscript', 'template',
    ];

    /** Lazily built converter shared by all calls (see getConverter()). */
    private static ?MarkdownConverter $converter = null;

    // ─────────────────────────────────────────────────────────────────────────
    // Public API
    // ─────────────────────────────────────────────────────────────────────────

    /**
     * Convert raw user input (legacy or new) to sanitized HTML.
     *
     * @param string|null $raw Raw input; null or whitespace-only yields ''.
     * @return string Safe HTML
     */
    public static function render(?string $raw): string
    {
        if ($raw === null || trim($raw) === '') {
            return '';
        }

        // 1. Convert legacy HTML fragments to Markdown-friendly text
        $text = self::legacyHtmlToMarkdown($raw);

        // 2. Parse Markdown → HTML
        $html = self::parseMarkdown($text);

        // 3. Sanitize HTML (strip disallowed tags / attrs)
        return self::sanitizeHtml($html);
    }

    /**
     * Normalize previously rendered HTML for display-time policy changes.
     * This is useful when stored HTML predates current link attributes or
     * when display rules depend on the author rather than the raw content.
     *
     * @param string|null $html       Stored HTML; null or whitespace-only yields ''.
     * @param bool        $allowLinks When false, <a> elements are replaced by their content.
     * @return string Safe HTML
     */
    public static function sanitizeRenderedHtml(?string $html, bool $allowLinks = true): string
    {
        if ($html === null || trim($html) === '') {
            return '';
        }

        return self::sanitizeHtml($html, $allowLinks);
    }

    /**
     * Strip ALL HTML from input, returning plain text with newlines preserved.
     *
     * @param string|null $html HTML or plain text; null yields ''.
     * @return string Trimmed plain text with entities decoded.
     */
    public static function stripToPlain(?string $html): string
    {
        if ($html === null) {
            return '';
        }

        // Convert <br> and </p> to line breaks before stripping.
        // On a PCRE failure fall back to the input rather than losing the content.
        $text = preg_replace(['/<br\s*\/?>/i', '/<\/p>/i'], "\n", $html) ?? $html;
        $text = strip_tags($text);
        $text = html_entity_decode($text, ENT_QUOTES | ENT_HTML5, 'UTF-8');

        return trim($text);
    }

    /**
     * Validate that a Markdown-lite string does not contain disallowed patterns.
     *
     * @return list<string> Validation error messages (empty = OK).
     */
    public static function validate(string $raw): array
    {
        $errors = [];
        $length = mb_strlen($raw);

        if ($length > 10_000) {
            $errors[] = 'Content exceeds maximum length of 10,000 characters.';
        }

        // Detect raw HTML tags (we forbid them)
        if (preg_match('/<[a-z][^>]*>/i', $raw) === 1) {
            $errors[] = 'HTML tags are not allowed. Use Markdown formatting instead.';
        }

        // Count emoji to prevent absolute spam
        $emojiCount = self::countEmoji($raw);
        if ($emojiCount > self::EMOJI_COUNT_MAX) {
            $errors[] = 'Too many emoji. Please limit emoji usage.';
        }

        // Reject emoji-flood content: density guard catches e.g. 15 emoji in a
        // 20-char string even when the absolute count is below EMOJI_COUNT_MAX.
        if ($emojiCount > 5 && $length > 0 && ($emojiCount / $length) > self::EMOJI_DENSITY_MAX) {
            $errors[] = 'Content is mostly emoji. Please add some text.';
        }

        return $errors;
    }

    /**
     * Collapse consecutive runs of the same emoji in $text.
     *
     * Delegates to LegacySmileyMapper::collapseFlood() so the behaviour is
     * consistent between new submissions and migrated legacy content.
     *
     * Example: "🍺 🍺 🍺 🍺 🍺 🍺 🍺" (7×) → "🍺 🍺 🍺 🍺 🍺 ×7"
     *
     * @param int $maxRun Keep at most this many consecutive identical emoji.
     */
    public static function collapseFlood(string $text, int $maxRun = 5): string
    {
        return LegacySmileyMapper::collapseFlood($text, $maxRun);
    }

    // ─────────────────────────────────────────────────────────────────────────
    // Private helpers
    // ─────────────────────────────────────────────────────────────────────────

    /**
     * Convert legacy HTML-style formatting to Markdown equivalents.
     * This runs BEFORE Markdown parsing to handle old content gracefully.
     *
     * Only attribute-less <b>/<strong>/<i>/<em>/<p> pairs and <br> are
     * translated; every other tag is removed, then entities are decoded.
     */
    private static function legacyHtmlToMarkdown(string $html): string
    {
        // Order matters: the catch-all tag stripper must run last.
        $replacements = [
            // Bold
            '/<b>(.*?)<\/b>/is' => '**$1**',
            '/<strong>(.*?)<\/strong>/is' => '**$1**',
            // Italic
            '/<i>(.*?)<\/i>/is' => '*$1*',
            '/<em>(.*?)<\/em>/is' => '*$1*',
            // Line breaks → actual newlines
            '/<br\s*\/?>/i' => "\n",
            // Paragraphs
            '/<p>(.*?)<\/p>/is' => "$1\n\n",
            // Strip remaining tags
            '/<[^>]+>/' => '',
        ];

        $result = $html;
        foreach ($replacements as $pattern => $replacement) {
            // A failed replacement (null) keeps the text from the previous step.
            $result = preg_replace($pattern, $replacement, $result) ?? $result;
        }

        // Decode HTML entities (e.g. &amp; → &)
        return html_entity_decode($result, ENT_QUOTES | ENT_HTML5, 'UTF-8');
    }

    /**
     * Parse Markdown to HTML using the shared converter.
     */
    private static function parseMarkdown(string $text): string
    {
        return (string) self::getConverter()->convert($text)->getContent();
    }

    /**
     * Whitelist-based HTML sanitizer.
     * Removes all tags not in ALLOWED_TAGS, and strips disallowed attributes.
     *
     * @param bool $allowLinks When false, <a> elements are replaced by their content.
     */
    private static function sanitizeHtml(string $html, bool $allowLinks = true): string
    {
        $doc = new \DOMDocument('1.0', 'UTF-8');

        // Suppress warnings from malformed fragments, but restore the caller's
        // libxml error mode afterwards: the setting is process-global.
        $previousErrorMode = libxml_use_internal_errors(true);
        try {
            // The leading XML declaration makes libxml read the fragment as UTF-8.
            $doc->loadHTML(
                '<?xml encoding="UTF-8"><html><body>' . $html . '</body></html>',
                LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD
            );
            libxml_clear_errors();
        } finally {
            libxml_use_internal_errors($previousErrorMode);
        }

        $body = $doc->getElementsByTagName('body')->item(0);
        if ($body === null) {
            return '';
        }

        self::cleanNode($body, $allowLinks);

        // Serialize the children only, dropping the html/body wrapper.
        $inner = '';
        foreach ($body->childNodes as $child) {
            $inner .= $doc->saveHTML($child);
        }

        return trim($inner);
    }

    /**
     * Recursively clean a DOMNode — strip forbidden tags/attributes.
     *
     * Descendants are cleaned BEFORE their parent is examined, so when an
     * element is unwrapped the children hoisted into its place have already
     * been sanitized.
     */
    private static function cleanNode(\DOMNode $node, bool $allowLinks = true): void
    {
        // childNodes is a live list and the tree is mutated below, so walk a snapshot.
        foreach (iterator_to_array($node->childNodes, false) as $child) {
            if ($child->nodeType === XML_TEXT_NODE) {
                continue;
            }

            if ($child->nodeType === XML_CDATA_SECTION_NODE) {
                // Re-create as a plain text node so the serializer escapes it
                // like any other text.
                $text = $child->ownerDocument?->createTextNode((string) $child->nodeValue);
                if ($text !== null) {
                    $node->replaceChild($text, $child);
                } else {
                    $node->removeChild($child);
                }
                continue;
            }

            if (! $child instanceof \DOMElement) {
                // Comments, processing instructions, etc. have no place in the output.
                $node->removeChild($child);
                continue;
            }

            $tag = strtolower($child->nodeName);

            if (in_array($tag, self::DROPPED_TAGS, true)) {
                $node->removeChild($child);
                continue;
            }

            // Depth-first: sanitize the subtree before deciding about this element.
            self::cleanNode($child, $allowLinks);

            if (! in_array($tag, self::ALLOWED_TAGS, true)) {
                self::unwrapElement($child);
                continue;
            }

            // Strip disallowed attributes
            $allowedAttrs = self::ALLOWED_ATTRS[$tag] ?? [];
            foreach (iterator_to_array($child->attributes, false) as $attr) {
                if (! in_array($attr->nodeName, $allowedAttrs, true)) {
                    $child->removeAttributeNode($attr);
                }
            }

            if ($tag !== 'a') {
                continue;
            }

            $href = $child->getAttribute('href');
            if (! $allowLinks || ($href !== '' && ! self::isSafeUrl($href))) {
                self::unwrapElement($child);
                continue;
            }

            // Force every link to open safely in a new tab.
            $child->setAttribute('rel', 'noopener noreferrer nofollow');
            $child->setAttribute('target', '_blank');
        }
    }

    /**
     * Replace an element with its own children, keeping their order.
     */
    private static function unwrapElement(\DOMElement $element): void
    {
        $parent = $element->parentNode;
        if ($parent === null) {
            return;
        }

        while ($element->firstChild !== null) {
            $parent->insertBefore($element->firstChild, $element);
        }

        $parent->removeChild($element);
    }

    /**
     * Very conservative URL whitelist: site-relative paths, anchors and
     * http(s) URLs only. Surrounding whitespace is ignored.
     */
    private static function isSafeUrl(string $url): bool
    {
        $normalized = strtolower(trim($url));

        // Allow relative paths and anchors
        if (str_starts_with($normalized, '/') || str_starts_with($normalized, '#')) {
            return true;
        }

        // Only allow http(s)
        return str_starts_with($normalized, 'http://') || str_starts_with($normalized, 'https://');
    }

    /**
     * Count Unicode emoji in a string (basic heuristic).
     *
     * NOTE: the U+FE00–U+FEFF range includes the variation selectors, so an
     * emoji followed by U+FE0F is counted twice. Input that is not valid
     * UTF-8 cannot be scanned and is reported as containing 0 emoji.
     */
    private static function countEmoji(string $text): int
    {
        // Match common emoji ranges
        $count = preg_match_all(
            '/[\x{1F300}-\x{1FAD6}\x{2600}-\x{27BF}\x{FE00}-\x{FEFF}]/u',
            $text
        );

        return $count === false ? 0 : $count;
    }

    /**
     * Lazy-load and cache the Markdown converter.
     */
    private static function getConverter(): MarkdownConverter
    {
        if (self::$converter === null) {
            $env = new Environment([
                'html_input' => 'strip',
                'allow_unsafe_links' => false,
                'max_nesting_level' => 10,
            ]);
            $env->addExtension(new CommonMarkCoreExtension());
            $env->addExtension(new AutolinkExtension());
            $env->addExtension(new StrikethroughExtension());

            self::$converter = new MarkdownConverter($env);
        }

        return self::$converter;
    }
}