347 lines
12 KiB
PHP
347 lines
12 KiB
PHP
<?php
|
||
|
||
namespace App\Services;
|
||
|
||
use App\Services\LegacySmileyMapper;
|
||
use League\CommonMark\Environment\Environment;
|
||
use League\CommonMark\Extension\Autolink\AutolinkExtension;
|
||
use League\CommonMark\Extension\CommonMark\CommonMarkCoreExtension;
|
||
use League\CommonMark\Extension\Strikethrough\StrikethroughExtension;
|
||
use League\CommonMark\MarkdownConverter;
|
||
|
||
/**
 * Sanitizes and renders user-submitted content.
 *
 * Rendering pipeline (see render()):
 *   1. Convert legacy HTML hints (<b>, <i>, <br>, <p>, ...) to Markdown and
 *      strip every other raw tag — raw HTML is not accepted as input.
 *   2. Parse the Markdown subset (bold, italic, code, links, lists, ...).
 *   3. Sanitize the rendered HTML: whitelisted tags only, whitelisted
 *      attributes only, http(s)/root-relative/anchor links only.
 *   4. Return safe HTML ready for storage or display.
 */
class ContentSanitizer
{
    /** Maximum number of emoji allowed before triggering a flood error. */
    public const EMOJI_COUNT_MAX = 50;

    /**
     * Maximum ratio of emoji-to-total-characters before content is considered
     * an emoji flood (applies only when emoji count > 5 to avoid false positives
     * on very short strings like a single reaction comment).
     */
    public const EMOJI_DENSITY_MAX = 0.40;

    /** Maximum accepted content length, in characters (not bytes). */
    private const MAX_LENGTH = 10_000;

    /** The density rule only applies when the emoji count exceeds this value. */
    private const EMOJI_DENSITY_MIN_COUNT = 5;

    /** HTML tags allowed in the final rendered output. */
    private const ALLOWED_TAGS = [
        'p', 'br', 'strong', 'em', 'code', 'pre',
        'a', 'ul', 'ol', 'li', 'blockquote', 'del',
    ];

    /** Allowed attributes per tag; tags not listed here keep no attributes. */
    private const ALLOWED_ATTRS = [
        'a' => ['href', 'title', 'rel', 'target'],
    ];

    /** `rel` value forced onto every link that survives sanitization. */
    private const LINK_REL = 'noopener noreferrer nofollow ugc';

    /** Lazily built converter, shared by all calls within the process. */
    private static ?MarkdownConverter $converter = null;

    // ─────────────────────────────────────────────────────────────────────────
    // Public API
    // ─────────────────────────────────────────────────────────────────────────

    /**
     * Convert raw user input (legacy or new) to sanitized HTML.
     *
     * @param string|null $raw Raw submitted text; null or blank yields ''.
     * @return string Safe HTML
     */
    public static function render(?string $raw): string
    {
        if ($raw === null || trim($raw) === '') {
            return '';
        }

        // 1. Legacy HTML fragments → Markdown-friendly text (other tags stripped)
        $markdown = self::legacyHtmlToMarkdown($raw);

        // 2. Markdown → HTML
        $html = self::parseMarkdown($markdown);

        // 3. Whitelist-sanitize the rendered HTML
        return self::sanitizeHtml($html);
    }

    /**
     * Normalize previously rendered HTML for display-time policy changes.
     *
     * This is useful when stored HTML predates current link attributes or
     * when display rules depend on the author rather than the raw content.
     *
     * @param string|null $html       Stored HTML; null or blank yields ''.
     * @param bool        $allowLinks When false, every <a> is replaced by its contents.
     * @return string Sanitized HTML
     */
    public static function sanitizeRenderedHtml(?string $html, bool $allowLinks = true): string
    {
        if ($html === null || trim($html) === '') {
            return '';
        }

        return self::sanitizeHtml($html, $allowLinks);
    }

    /**
     * Strip ALL HTML from input, returning plain text with newlines preserved.
     *
     * The result is plain text, NOT HTML: entities are decoded, so the caller
     * must escape it again before embedding it in markup.
     *
     * @param string|null $html HTML fragment; null yields ''.
     * @return string Trimmed plain text
     */
    public static function stripToPlain(?string $html): string
    {
        if ($html === null) {
            return '';
        }

        // Turn <br> and closing </p> into line breaks before the tags disappear
        $text = preg_replace(['/<br\s*\/?>/i', '/<\/p>/i'], "\n", $html);
        $text = strip_tags($text ?? '');
        $text = html_entity_decode($text, ENT_QUOTES | ENT_HTML5, 'UTF-8');

        return trim($text);
    }

    /**
     * Validate that a Markdown-lite string does not contain disallowed patterns.
     *
     * Checks, in order: maximum length, presence of raw HTML tags, absolute
     * emoji count, and emoji density.
     *
     * @param string $raw Raw submitted text.
     * @return string[] Validation error messages (empty = OK)
     */
    public static function validate(string $raw): array
    {
        $errors = [];
        $length = mb_strlen($raw);

        if ($length > self::MAX_LENGTH) {
            $errors[] = sprintf(
                'Content exceeds maximum length of %s characters.',
                number_format(self::MAX_LENGTH)
            );
        }

        // Detect raw HTML tags (we forbid them)
        if (preg_match('/<[a-z][^>]*>/i', $raw)) {
            $errors[] = 'HTML tags are not allowed. Use Markdown formatting instead.';
        }

        // Absolute cap on emoji
        $emojiCount = self::countEmoji($raw);
        if ($emojiCount > self::EMOJI_COUNT_MAX) {
            $errors[] = 'Too many emoji. Please limit emoji usage.';
        }

        // Density guard: catches e.g. 15 emoji in a 20-char string even when
        // the absolute count is below EMOJI_COUNT_MAX.
        if (
            $emojiCount > self::EMOJI_DENSITY_MIN_COUNT
            && $length > 0
            && ($emojiCount / $length) > self::EMOJI_DENSITY_MAX
        ) {
            $errors[] = 'Content is mostly emoji. Please add some text.';
        }

        return $errors;
    }

    /**
     * Collapse consecutive runs of the same emoji in $text.
     *
     * Delegates to LegacySmileyMapper::collapseFlood() so the behaviour is
     * consistent between new submissions and migrated legacy content. The
     * exact output format is defined by that class; the intended shape is:
     *
     * Example: "🍺 🍺 🍺 🍺 🍺 🍺 🍺" (7×) → "🍺 🍺 🍺 🍺 🍺 ×7"
     *
     * @param string $text   Text to process.
     * @param int    $maxRun Keep at most this many consecutive identical emoji.
     * @return string Text as returned by LegacySmileyMapper::collapseFlood()
     */
    public static function collapseFlood(string $text, int $maxRun = 5): string
    {
        return LegacySmileyMapper::collapseFlood($text, $maxRun);
    }

    // ─────────────────────────────────────────────────────────────────────────
    // Private helpers
    // ─────────────────────────────────────────────────────────────────────────

    /**
     * Convert legacy HTML-style formatting to Markdown equivalents.
     *
     * This runs BEFORE Markdown parsing to handle old content gracefully.
     * The rules are applied in declaration order; the final rule removes any
     * tag that the earlier rules did not translate.
     *
     * This step is a convenience transform, not the security boundary: the
     * entity decoding at the end can re-introduce angle brackets, which the
     * Markdown parser and sanitizeHtml() deal with afterwards.
     */
    private static function legacyHtmlToMarkdown(string $html): string
    {
        $replacements = [
            // Bold
            '/<b>(.*?)<\/b>/is' => '**$1**',
            '/<strong>(.*?)<\/strong>/is' => '**$1**',
            // Italic
            '/<i>(.*?)<\/i>/is' => '*$1*',
            '/<em>(.*?)<\/em>/is' => '*$1*',
            // Line breaks → actual newlines
            // NOTE(review): a single newline is a Markdown soft break and may
            // not render as <br> — confirm this matches the intended display.
            '/<br\s*\/?>/i' => "\n",
            // Paragraphs
            '/<p>(.*?)<\/p>/is' => "$1\n\n",
            // Strip remaining tags
            // NOTE(review): this also removes non-tag text such as "a < b > c".
            '/<[^>]+>/' => '',
        ];

        $result = $html;
        foreach ($replacements as $pattern => $replacement) {
            // preg_replace() returns null on PCRE failure; keep the previous text then
            $result = preg_replace($pattern, $replacement, $result) ?? $result;
        }

        // Decode HTML entities (e.g. &amp; → &)
        return html_entity_decode($result, ENT_QUOTES | ENT_HTML5, 'UTF-8');
    }

    /**
     * Parse Markdown-lite subset to HTML using the shared converter.
     */
    private static function parseMarkdown(string $text): string
    {
        return (string) self::getConverter()->convert($text)->getContent();
    }

    /**
     * Whitelist-based HTML sanitizer.
     *
     * Removes all tags not in ALLOWED_TAGS (keeping their sanitized contents),
     * strips attributes not in ALLOWED_ATTRS, and drops comments.
     *
     * @param string $html       HTML fragment to sanitize.
     * @param bool   $allowLinks When false, every <a> is replaced by its contents.
     * @return string Sanitized HTML fragment (no html/body wrapper)
     */
    private static function sanitizeHtml(string $html, bool $allowLinks = true): string
    {
        $doc = new \DOMDocument('1.0', 'UTF-8');

        // Collect parser warnings from malformed fragments instead of emitting
        // them, and restore the caller's libxml setting afterwards — the flag
        // is process-wide.
        $previous = libxml_use_internal_errors(true);
        try {
            $doc->loadHTML(
                // The XML declaration forces libxml to read the fragment as UTF-8
                '<?xml encoding="UTF-8"><html><body>' . $html . '</body></html>',
                LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD
            );
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previous);
        }

        $body = $doc->getElementsByTagName('body')->item(0);
        if (! $body instanceof \DOMElement) {
            // Nothing parseable: fail closed rather than echo the input back
            return '';
        }

        self::cleanNode($body, $allowLinks);

        // Serialize the children only, dropping the html/body wrapper
        $inner = '';
        foreach ($body->childNodes as $child) {
            $inner .= (string) $doc->saveHTML($child);
        }

        return trim($inner);
    }

    /**
     * Recursively clean a DOMNode — strip forbidden tags/attributes.
     *
     * Descendants are cleaned BEFORE the element itself is judged, so that
     * when a forbidden element is unwrapped, everything hoisted into the
     * parent has already been sanitized.
     *
     * @param \DOMNode $node       Node whose children are cleaned in place.
     * @param bool     $allowLinks When false, every <a> is unwrapped.
     */
    private static function cleanNode(\DOMNode $node, bool $allowLinks = true): void
    {
        // Iterate over a snapshot: childNodes is live and we mutate it below
        foreach (iterator_to_array($node->childNodes, false) as $child) {
            // Comments and processing instructions are never whitelisted
            if ($child instanceof \DOMComment || $child instanceof \DOMProcessingInstruction) {
                $node->removeChild($child);
                continue;
            }

            if (! $child instanceof \DOMElement) {
                continue;
            }

            // Depth-first: sanitize the subtree before deciding this element's fate
            self::cleanNode($child, $allowLinks);

            $tag = strtolower($child->nodeName);

            if (! in_array($tag, self::ALLOWED_TAGS, true)) {
                self::unwrapElement($child);
                continue;
            }

            self::stripDisallowedAttributes($child, self::ALLOWED_ATTRS[$tag] ?? []);

            if ($tag !== 'a') {
                continue;
            }

            $href = $child->getAttribute('href');
            if (! $allowLinks || ($href !== '' && ! self::isSafeUrl($href))) {
                self::unwrapElement($child);
                continue;
            }

            // Force surviving links to be safe
            $child->setAttribute('rel', self::LINK_REL);
            $child->setAttribute('target', '_blank');
        }
    }

    /**
     * Replace an element with its own children, preserving their order.
     */
    private static function unwrapElement(\DOMElement $element): void
    {
        $parent = $element->parentNode;
        if ($parent === null) {
            return;
        }

        while ($element->firstChild !== null) {
            $parent->insertBefore($element->firstChild, $element);
        }

        $parent->removeChild($element);
    }

    /**
     * Remove every attribute of $element whose name is not in $allowed.
     *
     * @param string[] $allowed Attribute names to keep.
     */
    private static function stripDisallowedAttributes(\DOMElement $element, array $allowed): void
    {
        // Collect first: removing while iterating the live attribute map skips entries
        $doomed = [];
        foreach ($element->attributes as $attribute) {
            if (! in_array($attribute->nodeName, $allowed, true)) {
                $doomed[] = $attribute->nodeName;
            }
        }

        foreach ($doomed as $name) {
            $element->removeAttribute($name);
        }
    }

    /**
     * Very conservative URL whitelist.
     *
     * Accepts URLs beginning with "/" or "#", and absolute http:// or https://
     * URLs. Anything else (javascript:, data:, mailto:, bare relative paths, ...)
     * is rejected. Leading/trailing whitespace and letter case are ignored
     * for every rule.
     */
    private static function isSafeUrl(string $url): bool
    {
        $normalized = strtolower(trim($url));

        foreach (['/', '#', 'http://', 'https://'] as $prefix) {
            if (str_starts_with($normalized, $prefix)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Count Unicode emoji in a string (basic heuristic).
     *
     * Counts code points in U+1F300–U+1FAFF and U+2600–U+27BF. Variation
     * selectors are deliberately not counted, so an emoji followed by U+FE0F
     * counts once. Multi-code-point sequences (skin tones, ZWJ sequences) may
     * still count more than once.
     *
     * @return int Number of matching code points; 0 if $text is not valid UTF-8.
     */
    private static function countEmoji(string $text): int
    {
        $count = preg_match_all('/[\x{1F300}-\x{1FAFF}\x{2600}-\x{27BF}]/u', $text);

        // preg_match_all() returns false when the subject is not valid UTF-8
        return $count === false ? 0 : $count;
    }

    /**
     * Lazy-load and cache the Markdown converter.
     *
     * Raw HTML in the Markdown source is stripped and unsafe link schemes are
     * refused by the parser itself; sanitizeHtml() is the second line of defence.
     */
    private static function getConverter(): MarkdownConverter
    {
        if (self::$converter === null) {
            $environment = new Environment([
                'html_input' => 'strip',
                'allow_unsafe_links' => false,
                'max_nesting_level' => 10,
            ]);
            $environment->addExtension(new CommonMarkCoreExtension());
            $environment->addExtension(new AutolinkExtension());
            $environment->addExtension(new StrikethroughExtension());

            self::$converter = new MarkdownConverter($environment);
        }

        return self::$converter;
    }
}
|