242 lines
7.2 KiB
PHP
242 lines
7.2 KiB
PHP
<?php
|
||
|
||
declare(strict_types=1);
|
||
|
||
namespace App\Services\AiBiography;
|
||
|
||
/**
 * Validates generated biography text before it is stored.
 *
 * v1.1 additions:
 *  – Extended forbidden phrases (renowned, celebrated, iconic, etc.)
 *  – Generic filler detection ("creator journey", "over the years" repeated)
 *  – Repetitive phrase detection
 *  – Sparse-profile mismatch check (rich-sounding bio for sparse creator)
 *  – Stat-dump detection is listed as a v1.1 goal, but this class contains
 *    no dedicated check for it.
 *
 * Rejects output that is:
 *  – empty or too short to be useful
 *  – too long (hard cap)
 *  – not a single paragraph (blank line separating blocks)
 *  – contains markdown (headings, bullets, bold, italic, code)
 *  – contains forbidden hype terms
 *  – contains placeholder or apology patterns
 *  – sounds too rich/boastful for a sparse creator profile
 *
 * Matching notes:
 *  – Words are counted as whitespace-separated tokens that contain at least one
 *    Unicode letter or digit, so years/numbers and accented words count once each.
 *  – Forbidden phrases and rich indicators are matched on word boundaries
 *    (with an optional s/es/ed/ing ending), so "era" does not fire on "several"
 *    and "as an ai" does not fire on "as an airbrush artist".
 *  – Typographic apostrophes and hyphens are folded to ASCII before matching,
 *    so "I can’t" is caught by the "i can't" pattern.
 */
final class AiBiographyValidator
{
    private const MIN_WORDS = 20;

    private const MAX_WORDS = 180;

    /**
     * Phrases that are always forbidden, regardless of tier.
     * These indicate hallucinated praise, AI-apology patterns, or unsupported claims.
     * Entries are lower-case and use ASCII apostrophes/hyphens.
     */
    private const FORBIDDEN_PHRASES = [
        // Unsupported significance claims
        'world-class',
        'world class',
        'iconic visionary',
        'unmatched style',
        'legendary',
        'changed the platform',
        'beloved by everyone',
        'renowned for',
        'masterpiece creator',
        'masterclass',
        'celebrated artist',
        'celebrated creator',
        'celebrated by',
        'iconic creator',
        'iconic artist',
        'iconic work',
        'platform legend',
        'community favorite',
        'widely recognized',
        'highly regarded',
        'critically acclaimed',
        // AI apology / refusal patterns
        'i cannot',
        "i can't",
        'i apologize',
        'as an ai',
        'as a language model',
        'i do not have',
        "i don't have",
        'based on the information provided',
        'unfortunately',
        "i'm unable to",
        'i am unable to',
        // Vague over-praising filler
        'truly remarkable',
        'absolutely exceptional',
        'without a doubt',
        'undeniably talented',
    ];

    /**
     * Phrases that signal generic, formulaic filler when used more than once.
     * A single occurrence is allowed; repeated use is rejected.
     */
    private const REPETITION_PHRASES = [
        'creator journey',
        'over the years',
        'has been part of skinbase',
        'has been a member',
        'throughout the years',
        'through the years',
        'journey on skinbase',
    ];

    /**
     * Terms that are only plausible for creators with a rich public history.
     * Used by the sparse-profile mismatch check.
     */
    private const RICH_INDICATORS = [
        'featured',
        'best-performing',
        'standout',
        'milestone',
        'comeback',
        'evolution',
        'remaster',
        'era',
        'streak',
        'downloads',
        'most productive',
    ];

    /** Number of distinct rich indicators at which a sparse biography is rejected. */
    private const SPARSE_RICH_INDICATOR_LIMIT = 3;

    /**
     * Validate the generated biography.
     *
     * @param  string  $text  the generated biography text
     * @param  string  $qualityTier  'rich'|'medium'|'sparse' — used for sparse mismatch check
     * @return list<string> validation errors; empty list means valid
     */
    public function validate(string $text, string $qualityTier = 'rich'): array
    {
        // Replace invalid byte sequences once, so the /u regexes below cannot
        // fail silently on malformed input.
        $trimmed = trim(mb_scrub($text, 'UTF-8'));

        if ($trimmed === '') {
            return ['Biography is empty.'];
        }

        $errors = [];

        $wordCount = $this->countWords($trimmed);

        if ($wordCount < self::MIN_WORDS) {
            $errors[] = "Biography is too short ({$wordCount} words, minimum " . self::MIN_WORDS . ').';
        }

        if ($wordCount > self::MAX_WORDS) {
            $errors[] = "Biography is too long ({$wordCount} words, maximum " . self::MAX_WORDS . ').';
        }

        if ($this->containsMarkdown($trimmed)) {
            $errors[] = 'Biography contains markdown or structural formatting.';
        }

        if ($this->hasMultipleParagraphs($trimmed)) {
            $errors[] = 'Biography contains multiple paragraphs; must be a single paragraph.';
        }

        // Lower-casing and punctuation folding are loop-invariant: do them once.
        $normalised = $this->normaliseForMatching($trimmed);

        foreach (self::FORBIDDEN_PHRASES as $phrase) {
            if ($this->containsTerm($normalised, $phrase)) {
                $errors[] = "Biography contains forbidden phrase: \"{$phrase}\".";
                // Only the first offending phrase is reported.
                break;
            }
        }

        $repetitionError = $this->checkRepetition($normalised);
        if ($repetitionError !== null) {
            $errors[] = $repetitionError;
        }

        if ($qualityTier === 'sparse' && $this->soundsTooRichForSparseProfile($normalised)) {
            $errors[] = 'Biography sounds too claim-heavy for a sparse creator profile.';
        }

        return $errors;
    }

    /**
     * Convenience wrapper: true when validate() reports no errors.
     */
    public function isValid(string $text, string $qualityTier = 'rich'): bool
    {
        return $this->validate($text, $qualityTier) === [];
    }

    // -------------------------------------------------------------------------

    /**
     * Count whitespace-separated tokens that contain at least one letter or digit.
     *
     * str_word_count() is byte/locale based and ignores digits, so it skips
     * "2004" entirely and splits "naïve" into two words.
     */
    private function countWords(string $text): int
    {
        $tokens = preg_split('/\s+/u', $text, -1, PREG_SPLIT_NO_EMPTY);

        if ($tokens === false) {
            // Regex engine failure: fall back to the byte-based count.
            return str_word_count($text);
        }

        $count = 0;
        foreach ($tokens as $token) {
            // Skip tokens that are pure punctuation (e.g. a lone dash).
            if (preg_match('/[\p{L}\p{N}]/u', $token) === 1) {
                $count++;
            }
        }

        return $count;
    }

    /**
     * Produce the canonical form used for phrase matching: lower-case,
     * typographic apostrophes/hyphens folded to ASCII, whitespace runs
     * (including newlines) collapsed to a single space.
     */
    private function normaliseForMatching(string $text): string
    {
        $folded = str_replace(
            ["\u{2018}", "\u{2019}", "\u{02BC}", "\u{2010}", "\u{2011}"],
            ["'", "'", "'", '-', '-'],
            mb_strtolower($text, 'UTF-8'),
        );

        return preg_replace('/\s+/u', ' ', $folded) ?? $folded;
    }

    /**
     * Whether $term occurs in $haystack as a whole word or phrase.
     *
     * The term must not be glued to a preceding or following letter/digit;
     * a trailing s/es/ed/ing is tolerated so plurals and simple inflections
     * ("masterclasses", "remastered") still match.
     *
     * @param  string  $haystack  text already passed through normaliseForMatching()
     * @param  string  $term  lower-case literal term
     */
    private function containsTerm(string $haystack, string $term): bool
    {
        $pattern = '/(?<![\p{L}\p{N}])' . preg_quote($term, '/') . '(?:s|es|ed|ing)?(?![\p{L}\p{N}])/u';

        return preg_match($pattern, $haystack) === 1;
    }

    /**
     * Detect markdown or other structural formatting in the text.
     */
    private function containsMarkdown(string $text): bool
    {
        // Any backtick means inline code or a fenced block.
        if (str_contains($text, '`')) {
            return true;
        }

        $patterns = [
            // Headings: #, ##, ... at the start of a line
            '/^\s*#{1,6}\s/m',
            // Bullets: lines starting with - or *
            '/^\s*[-*]\s/m',
            // Numbered list items: "1. "
            '/^\s*\d+\.\s/m',
            // Bold markers, or an asterisk followed by another character
            '/\*\*|__|\*[^*]/',
            // Underscore emphasis: a delimited _span_ that is not intraword,
            // so handles such as dark_knight are not treated as italics.
            '/(?<![\p{L}\p{N}_])_(?!\s)[^_\n]+(?<!\s)_(?![\p{L}\p{N}_])/u',
        ];

        foreach ($patterns as $pattern) {
            if (preg_match($pattern, $text) === 1) {
                return true;
            }
        }

        return false;
    }

    /**
     * A blank (or whitespace-only) line between text indicates a paragraph break.
     */
    private function hasMultipleParagraphs(string $text): bool
    {
        return preg_match('/\n\s*\n/', $text) === 1;
    }

    /**
     * Check whether any formulaic phrase appears more than once,
     * which usually indicates a recycled or low-quality output.
     *
     * @param  string  $normalised  text already passed through normaliseForMatching()
     * @return string|null error message for the first repeated phrase, or null
     */
    private function checkRepetition(string $normalised): ?string
    {
        foreach (self::REPETITION_PHRASES as $phrase) {
            // substr_count() counts non-overlapping occurrences.
            if (substr_count($normalised, $phrase) >= 2) {
                return "Biography repeats the phrase \"{$phrase}\" too many times.";
            }
        }

        return null;
    }

    /**
     * For sparse-profile biographies, reject text that sounds too achievement-heavy.
     * These signals typically appear only in rich profiles and would be hallucinated
     * or misleading when the creator has very little public history.
     *
     * @param  string  $normalised  text already passed through normaliseForMatching()
     */
    private function soundsTooRichForSparseProfile(string $normalised): bool
    {
        $hitCount = 0;

        foreach (self::RICH_INDICATORS as $indicator) {
            // Each indicator counts at most once, however often it appears.
            if ($this->containsTerm($normalised, $indicator)) {
                $hitCount++;
            }
        }

        // A sparse biography referencing this many rich signals likely hallucinated them.
        return $hitCount >= self::SPARSE_RICH_INDICATOR_LIMIT;
    }
}
|