messages implemented

This commit is contained in:
2026-02-26 21:12:32 +01:00
parent d0aefc5ddc
commit 15b7b77d20
168 changed files with 14728 additions and 6786 deletions

View File

@@ -0,0 +1,323 @@
<?php
namespace App\Services;
use App\Services\LegacySmileyMapper;
use League\CommonMark\Environment\Environment;
use League\CommonMark\Extension\Autolink\AutolinkExtension;
use League\CommonMark\Extension\CommonMark\CommonMarkCoreExtension;
use League\CommonMark\Extension\Strikethrough\StrikethroughExtension;
use League\CommonMark\MarkdownConverter;
/**
 * Sanitizes and renders user-submitted content.
 *
 * Rendering pipeline (see render()):
 *  1. Rewrite legacy HTML hints (<b>, <i>, <br>, <p>, ...) as Markdown, drop any
 *     other tag-like markup and decode HTML entities.
 *  2. Parse the Markdown subset (bold, italic, code, links, strikethrough,
 *     autolinks) with the converter built in getConverter().
 *  3. Sanitize the rendered HTML: whitelist-only tags and attributes, links
 *     restricted to http(s), root-relative and anchor targets.
 *  4. Return HTML that is safe for storage or display.
 */
class ContentSanitizer
{
    /** Maximum number of emoji allowed before triggering a flood error. */
    public const EMOJI_COUNT_MAX = 50;

    /**
     * Maximum ratio of emoji-to-total-characters before content is considered
     * an emoji flood (applies only when emoji count > 5 to avoid false positives
     * on very short strings like a single reaction comment).
     */
    public const EMOJI_DENSITY_MAX = 0.40;

    /** HTML tags allowed in the final rendered output. */
    private const ALLOWED_TAGS = [
        'p', 'br', 'strong', 'em', 'code', 'pre',
        'a', 'ul', 'ol', 'li', 'blockquote', 'del',
    ];

    /** Allowed attributes per tag; tags not listed here keep no attributes. */
    private const ALLOWED_ATTRS = [
        'a' => ['href', 'title', 'rel', 'target'],
    ];

    /** Lazily created converter shared by all calls (see getConverter()). */
    private static ?MarkdownConverter $converter = null;

    // ─────────────────────────────────────────────────────────────────────────
    // Public API
    // ─────────────────────────────────────────────────────────────────────────

    /**
     * Convert raw user input (legacy or new) to sanitized HTML.
     *
     * @param string|null $raw Raw input; null or whitespace-only yields ''.
     * @return string Safe HTML
     */
    public static function render(?string $raw): string
    {
        if ($raw === null || trim($raw) === '') {
            return '';
        }

        // 1. Convert legacy HTML fragments to Markdown-friendly text
        $text = self::legacyHtmlToMarkdown($raw);

        // 2. Parse Markdown → HTML
        $html = self::parseMarkdown($text);

        // 3. Sanitize HTML (strip disallowed tags / attrs)
        return self::sanitizeHtml($html);
    }

    /**
     * Strip ALL HTML from input, returning plain text with newlines preserved.
     *
     * <br> and closing </p> tags become "\n" before the remaining tags are
     * removed; HTML entities are decoded and the result is trimmed.
     */
    public static function stripToPlain(?string $html): string
    {
        if ($html === null) {
            return '';
        }

        $text = preg_replace(['/<br\s*\/?>/i', '/<\/p>/i'], "\n", $html);
        $text = strip_tags($text ?? '');
        $text = html_entity_decode($text, ENT_QUOTES | ENT_HTML5, 'UTF-8');

        return trim($text);
    }

    /**
     * Validate that a Markdown-lite string does not contain disallowed patterns.
     *
     * Checks, in order: maximum length, presence of raw HTML opening tags,
     * absolute emoji count, and emoji density.
     *
     * @return string[] Validation error messages (empty = OK).
     */
    public static function validate(string $raw): array
    {
        $errors = [];
        $length = mb_strlen($raw);

        if ($length > 10_000) {
            $errors[] = 'Content exceeds maximum length of 10,000 characters.';
        }

        // Detect raw HTML tags (we forbid them)
        if (preg_match('/<[a-z][^>]*>/i', $raw)) {
            $errors[] = 'HTML tags are not allowed. Use Markdown formatting instead.';
        }

        // Count emoji to prevent absolute spam
        $emojiCount = self::countEmoji($raw);
        if ($emojiCount > self::EMOJI_COUNT_MAX) {
            $errors[] = 'Too many emoji. Please limit emoji usage.';
        }

        // Reject emoji-flood content: density guard catches e.g. 15 emoji in a
        // 20-char string even when the absolute count is below EMOJI_COUNT_MAX.
        // $emojiCount > 5 implies $length > 0, so the division is safe.
        if ($emojiCount > 5 && ($emojiCount / $length) > self::EMOJI_DENSITY_MAX) {
            $errors[] = 'Content is mostly emoji. Please add some text.';
        }

        return $errors;
    }

    /**
     * Collapse consecutive runs of the same emoji in $text.
     *
     * Delegates to LegacySmileyMapper::collapseFlood() so the behaviour is
     * consistent between new submissions and migrated legacy content; the
     * exact output format is defined there.
     *
     * Example: "🍺 🍺 🍺 🍺 🍺 🍺 🍺" (7×) → "🍺 🍺 🍺 🍺 🍺 ×7"
     *
     * @param int $maxRun Keep at most this many consecutive identical emoji.
     */
    public static function collapseFlood(string $text, int $maxRun = 5): string
    {
        return LegacySmileyMapper::collapseFlood($text, $maxRun);
    }

    // ─────────────────────────────────────────────────────────────────────────
    // Private helpers
    // ─────────────────────────────────────────────────────────────────────────

    /**
     * Convert legacy HTML-style formatting to Markdown equivalents.
     * This runs BEFORE Markdown parsing to handle old content gracefully.
     *
     * Replacements are applied in the listed order; the last rule removes any
     * remaining tag-like markup. Entities are decoded afterwards, so encoded
     * markup (e.g. "&lt;b&gt;") reaches the Markdown parser as literal "<b>"
     * and is handled by the later pipeline stages.
     */
    private static function legacyHtmlToMarkdown(string $html): string
    {
        $replacements = [
            // Bold
            '/<b>(.*?)<\/b>/is'           => '**$1**',
            '/<strong>(.*?)<\/strong>/is' => '**$1**',
            // Italic
            '/<i>(.*?)<\/i>/is'           => '*$1*',
            '/<em>(.*?)<\/em>/is'         => '*$1*',
            // Line breaks → actual newlines
            '/<br\s*\/?>/i'               => "\n",
            // Paragraphs
            '/<p>(.*?)<\/p>/is'           => '$1' . "\n\n",
            // Strip remaining tags. Only constructs that look like markup
            // (<x…>, </x…>, <!…>, <?…>) are removed, so ordinary text such as
            // "1 <2 and 3> 2" is no longer deleted.
            '/<(?:\/?[a-z]|[!?])[^>]*>/i' => '',
        ];

        $result = $html;
        foreach ($replacements as $pattern => $replacement) {
            // On a PCRE failure keep the previous value; the later pipeline
            // stages still strip and sanitize whatever is left.
            $result = preg_replace($pattern, $replacement, $result) ?? $result;
        }

        // Decode HTML entities (e.g. &amp; → &)
        return html_entity_decode($result, ENT_QUOTES | ENT_HTML5, 'UTF-8');
    }

    /**
     * Parse Markdown-lite subset to HTML.
     */
    private static function parseMarkdown(string $text): string
    {
        return (string) self::getConverter()->convert($text)->getContent();
    }

    /**
     * Whitelist-based HTML sanitizer.
     * Removes all tags not in ALLOWED_TAGS, and strips disallowed attributes.
     *
     * @param string $html UTF-8 HTML fragment.
     * @return string Sanitized fragment, trimmed.
     */
    private static function sanitizeHtml(string $html): string
    {
        $doc = new \DOMDocument('1.0', 'UTF-8');

        // Collect parser warnings from malformed fragments instead of emitting
        // them, and restore the caller's libxml setting afterwards.
        $previous = libxml_use_internal_errors(true);
        try {
            // The <meta> charset declaration is required: without it loadHTML()
            // does not treat the input as UTF-8 and multi-byte characters
            // (emoji, accents) come out garbled.
            $loaded = $doc->loadHTML(
                '<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"></head><body>'
                . $html
                . '</body></html>',
                LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD
            );
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previous);
        }

        $body = $doc->getElementsByTagName('body')->item(0);
        if (! $loaded || $body === null) {
            // Fail closed: fall back to escaped plain text rather than
            // returning markup that was never inspected.
            return trim(htmlspecialchars(
                strip_tags($html),
                ENT_QUOTES | ENT_SUBSTITUTE,
                'UTF-8',
                double_encode: false
            ));
        }

        self::cleanNode($body);

        // Serialize the body's children, i.e. without the html/body wrapper.
        $inner = '';
        foreach ($body->childNodes as $child) {
            $inner .= (string) $doc->saveHTML($child);
        }

        return trim($inner);
    }

    /**
     * Recursively clean a DOMNode: strip forbidden tags/attributes.
     *
     * Descendants are cleaned BEFORE their parent is inspected, so when a
     * forbidden element (or an unsafe link) is unwrapped, the children that
     * get hoisted into $node have already been sanitized.
     */
    private static function cleanNode(\DOMNode $node): void
    {
        // Iterate over a snapshot: childNodes is live and is mutated below.
        foreach (iterator_to_array($node->childNodes, false) as $child) {
            if ($child->nodeType === XML_TEXT_NODE) {
                continue;
            }

            if (! $child instanceof \DOMElement) {
                // Comments, processing instructions, CDATA sections, ... carry
                // no renderable user content; drop them rather than rely on
                // how the serializer would escape them.
                $node->removeChild($child);
                continue;
            }

            self::cleanNode($child);

            $tag = strtolower($child->nodeName);
            if (! in_array($tag, self::ALLOWED_TAGS, true)) {
                self::unwrap($child);
                continue;
            }

            // Strip disallowed attributes (snapshot first: the map is live).
            $allowedAttrs = self::ALLOWED_ATTRS[$tag] ?? [];
            foreach (iterator_to_array($child->attributes, false) as $attr) {
                if (! in_array($attr->nodeName, $allowedAttrs, true)) {
                    $child->removeAttributeNode($attr);
                }
            }

            if ($tag === 'a') {
                $href = $child->getAttribute('href');
                if ($href !== '' && ! self::isSafeUrl($href)) {
                    // Keep the link text, drop the link itself.
                    self::unwrap($child);
                    continue;
                }

                // Force external links to be safe
                $child->setAttribute('rel', 'noopener noreferrer nofollow');
                $child->setAttribute('target', '_blank');
            }
        }
    }

    /**
     * Replace an element with its children, preserving their order.
     */
    private static function unwrap(\DOMElement $element): void
    {
        $parent = $element->parentNode;
        if ($parent === null) {
            return;
        }

        while ($element->firstChild !== null) {
            $parent->insertBefore($element->firstChild, $element);
        }
        $parent->removeChild($element);
    }

    /**
     * Very conservative URL whitelist.
     *
     * Accepts URLs starting with "/" or "#" and absolute http:// or https://
     * URLs (scheme compared case-insensitively). Surrounding whitespace is
     * ignored for every check.
     */
    private static function isSafeUrl(string $url): bool
    {
        $candidate = trim($url);

        // Allow relative paths and anchors
        if (str_starts_with($candidate, '/') || str_starts_with($candidate, '#')) {
            return true;
        }

        // Only allow http(s)
        $lower = strtolower($candidate);

        return str_starts_with($lower, 'http://') || str_starts_with($lower, 'https://');
    }

    /**
     * Count Unicode emoji in a string (basic heuristic).
     *
     * Counts code points in U+1F300–U+1FAFF and U+2600–U+27BF. Variation
     * selectors are not counted, so "❤️" (U+2764 U+FE0F) is one emoji.
     * Multi-code-point sequences (e.g. ZWJ sequences) still count once per
     * matching code point. Returns 0 when $text is not valid UTF-8.
     */
    private static function countEmoji(string $text): int
    {
        $count = preg_match_all('/[\x{1F300}-\x{1FAFF}\x{2600}-\x{27BF}]/u', $text);

        return $count === false ? 0 : $count;
    }

    /**
     * Lazy-load and cache the Markdown converter.
     */
    private static function getConverter(): MarkdownConverter
    {
        if (self::$converter === null) {
            $env = new Environment([
                'html_input'         => 'strip',
                'allow_unsafe_links' => false,
                'max_nesting_level'  => 10,
            ]);
            $env->addExtension(new CommonMarkCoreExtension());
            $env->addExtension(new AutolinkExtension());
            $env->addExtension(new StrikethroughExtension());

            self::$converter = new MarkdownConverter($env);
        }

        return self::$converter;
    }
}