Files
SkinbaseNova/app/Services/AiBiography/AiBiographyValidator.php
2026-04-18 17:02:56 +02:00

242 lines
7.2 KiB
PHP
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
<?php
declare(strict_types=1);
namespace App\Services\AiBiography;
/**
 * Validates generated biography text before it is stored.
 *
 * v1.1 additions:
 *  - Extended forbidden phrases (renowned, celebrated, iconic, etc.)
 *  - Generic filler detection ("creator journey", "over the years" spam)
 *  - Repetitive phrase detection
 *  - Sparse-profile mismatch check (rich-sounding bio for sparse creator)
 *
 * NOTE(review): earlier notes for this class also advertise "stat-dump
 * detection (too many bare numbers in a short text)", but no such check is
 * implemented here.
 *
 * Rejects output that:
 *  - is empty or too short to be useful
 *  - is too long (hard cap)
 *  - is not a single paragraph (blank line separating blocks)
 *  - contains markdown (headings, bullets, bold, italic, code)
 *  - contains forbidden hype terms
 *  - contains placeholder or apology patterns
 *  - sounds too rich/boastful for a sparse creator profile
 */
final class AiBiographyValidator
{
    private const MIN_WORDS = 20;
    private const MAX_WORDS = 180;

    /**
     * Phrases that are always forbidden, regardless of tier.
     * These indicate hallucinated praise, AI-apology patterns, or unsupported claims.
     *
     * Entries must be lower-case and use ASCII apostrophes; the text under test
     * is normalised the same way before matching.
     */
    private const FORBIDDEN_PHRASES = [
        // Unsupported significance claims
        'world-class',
        'world class',
        'iconic visionary',
        'unmatched style',
        'legendary',
        'changed the platform',
        'beloved by everyone',
        'renowned for',
        'masterpiece creator',
        'masterclass',
        'celebrated artist',
        'celebrated creator',
        'celebrated by',
        'iconic creator',
        'iconic artist',
        'iconic work',
        'platform legend',
        'community favorite',
        'widely recognized',
        'highly regarded',
        'critically acclaimed',
        // AI apology / refusal patterns
        'i cannot',
        "i can't",
        'i apologize',
        'as an ai',
        'as a language model',
        'i do not have',
        "i don't have",
        'based on the information provided',
        'unfortunately',
        "i'm unable to",
        'i am unable to',
        // Vague over-praising filler
        'truly remarkable',
        'absolutely exceptional',
        'without a doubt',
        'undeniably talented',
    ];

    /**
     * Phrases that signal generic, formulaic filler when used more than once.
     * A single occurrence is allowed; repeated use is rejected.
     */
    private const REPETITION_PHRASES = [
        'creator journey',
        'over the years',
        'has been part of skinbase',
        'has been a member',
        'throughout the years',
        'through the years',
        'journey on skinbase',
    ];

    /**
     * Terms that normally only make sense for creators with a rich public
     * history. Matched as whole words (simple inflections allowed).
     */
    private const RICH_INDICATORS = [
        'featured',
        'best-performing',
        'standout',
        'milestone',
        'comeback',
        'evolution',
        'remaster',
        'era',
        'streak',
        'downloads',
        'most productive',
    ];

    /** Number of distinct rich indicators at which a sparse-tier bio is rejected. */
    private const RICH_INDICATOR_LIMIT = 3;

    /**
     * Validate the generated biography.
     *
     * All checks run (except that an empty text short-circuits), so the result
     * may contain several errors. Only the first matching forbidden phrase is
     * reported.
     *
     * @param string $text        the generated biography text
     * @param string $qualityTier 'rich'|'medium'|'sparse'; only 'sparse' enables the mismatch check
     * @return list<string> validation errors; empty list means valid
     */
    public function validate(string $text, string $qualityTier = 'rich'): array
    {
        $trimmed = trim($text);
        if ($trimmed === '') {
            return ['Biography is empty.'];
        }

        $errors = [];

        $wordCount = $this->countWords($trimmed);
        if ($wordCount < self::MIN_WORDS) {
            $errors[] = "Biography is too short ({$wordCount} words, minimum " . self::MIN_WORDS . ').';
        }
        if ($wordCount > self::MAX_WORDS) {
            $errors[] = "Biography is too long ({$wordCount} words, maximum " . self::MAX_WORDS . ').';
        }

        if ($this->containsMarkdown($trimmed)) {
            $errors[] = 'Biography contains markdown or structural formatting.';
        }
        if ($this->hasMultipleParagraphs($trimmed)) {
            $errors[] = 'Biography contains multiple paragraphs; must be a single paragraph.';
        }

        // Normalise once; the forbidden list is lower-case with ASCII apostrophes.
        $normalized = $this->normalizeForMatching($trimmed);
        foreach (self::FORBIDDEN_PHRASES as $phrase) {
            if ($this->containsTerm($normalized, $phrase)) {
                $errors[] = "Biography contains forbidden phrase: \"{$phrase}\".";
                break;
            }
        }

        $repetitionError = $this->checkRepetition($trimmed);
        if ($repetitionError !== null) {
            $errors[] = $repetitionError;
        }

        if ($qualityTier === 'sparse' && $this->soundsTooRichForSparseProfile($trimmed)) {
            $errors[] = 'Biography sounds too claim-heavy for a sparse creator profile.';
        }

        return $errors;
    }

    /**
     * Convenience wrapper: true when validate() reports no errors.
     */
    public function isValid(string $text, string $qualityTier = 'rich'): bool
    {
        return $this->validate($text, $qualityTier) === [];
    }

    // -------------------------------------------------------------------------

    /**
     * Count whitespace-separated tokens that contain at least one letter or digit.
     *
     * str_word_count() is byte/locale based: it splits accented words and never
     * counts numeric tokens such as "2004", which skews the MIN/MAX limits.
     * Falls back to str_word_count() only if the text is not valid UTF-8.
     */
    private function countWords(string $text): int
    {
        $count = preg_match_all('/\S*[\p{L}\p{N}]\S*/u', $text);

        return $count === false ? str_word_count($text) : $count;
    }

    /**
     * Lower-case the text and fold typographic apostrophes to ASCII so phrases
     * like "I’m unable to" match their list entries.
     */
    private function normalizeForMatching(string $text): string
    {
        return str_replace(
            ["\u{2018}", "\u{2019}", "\u{02BC}"],
            "'",
            mb_strtolower($text),
        );
    }

    /**
     * Whether $haystack (already normalised) contains $term as a whole word or
     * phrase. A trailing s/es/ed/ing is tolerated so "milestones" or
     * "remastered" still count, while "several" no longer matches "era" and
     * "has an airy" no longer matches "as an ai".
     */
    private function containsTerm(string $haystack, string $term): bool
    {
        $pattern = '/(?<![\p{L}\p{N}])' . preg_quote($term, '/') . '(?:s|es|ed|ing)?(?![\p{L}\p{N}])/u';
        $result = preg_match($pattern, $haystack);

        if ($result === false) {
            // Most likely invalid UTF-8: degrade to a plain substring test.
            return str_contains($haystack, $term);
        }

        return $result === 1;
    }

    /**
     * Detect markdown or other structural formatting in the text.
     */
    private function containsMarkdown(string $text): bool
    {
        $patterns = [
            // Headings: #, ##, ... at the start of a line.
            '/^\s*#{1,6}\s/m',
            // Bullets: lines starting with - or *.
            '/^\s*[-*]\s/m',
            // Numbered list items: "1. ".
            '/^\s*\d+\.\s/m',
            // Bold markers, or any asterisk used as an emphasis marker.
            '/\*\*|__|\*[^*]/',
            // Underscore emphasis (_text_). Intra-word underscores, as in a
            // handle like "dark_knight", are not markdown and are allowed.
            '/(?<![\p{L}\p{N}_])_[^_\s][^_]*_(?![\p{L}\p{N}_])/u',
        ];

        foreach ($patterns as $pattern) {
            if (preg_match($pattern, $text) === 1) {
                return true;
            }
        }

        // Inline code or fenced code blocks both require a backtick.
        return str_contains($text, '`');
    }

    /**
     * A blank (or whitespace-only) line between two lines marks a paragraph
     * break. Single newlines are tolerated.
     */
    private function hasMultipleParagraphs(string $text): bool
    {
        return preg_match('/\n\s*\n/', $text) === 1;
    }

    /**
     * Check whether any formulaic phrase appears more than once,
     * which usually indicates a recycled or low-quality output.
     *
     * @return string|null error message for the first repeated phrase, or null
     */
    private function checkRepetition(string $text): ?string
    {
        $lower = mb_strtolower($text);

        foreach (self::REPETITION_PHRASES as $phrase) {
            // substr_count() counts non-overlapping occurrences.
            if (substr_count($lower, $phrase) >= 2) {
                return "Biography repeats the phrase \"{$phrase}\" too many times.";
            }
        }

        return null;
    }

    /**
     * For sparse-profile biographies, reject text that sounds too achievement-heavy.
     * These signals typically appear only in rich profiles and would be hallucinated
     * or misleading when the creator has very little public history.
     */
    private function soundsTooRichForSparseProfile(string $text): bool
    {
        $normalized = $this->normalizeForMatching($text);

        $hitCount = 0;
        foreach (self::RICH_INDICATORS as $indicator) {
            if ($this->containsTerm($normalized, $indicator)) {
                $hitCount++;
            }
        }

        return $hitCount >= self::RICH_INDICATOR_LIMIT;
    }
}