Skip to content

Commit c1fae4c

Browse files
awehttam and claude committed
Fix PostgreSQL encoding error when TIC FILE_ID.DIZ contains CP437 characters
Adds sanitizeToUtf8() helper to TicFileProcessor that converts CP437/ISO-8859-1 encoded text to valid UTF-8 before database insertion. Applied to DIZ content, short/long descriptions, and From address fields. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent fe9722d commit c1fae4c

1 file changed

Lines changed: 31 additions & 3 deletions

File tree

src/TicFileProcessor.php

Lines changed: 31 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -232,6 +232,7 @@ protected function enrichFromFileIdDiz(array &$ticData, string $filePath): void
232232
return;
233233
}
234234

235+
$dizContent = $this->sanitizeToUtf8($dizContent);
235236
$dizContent = str_replace(["\r\n", "\r"], "\n", $dizContent);
236237
$lines = array_values(array_filter(array_map('rtrim', explode("\n", $dizContent)), fn($l) => $l !== ''));
237238

@@ -597,13 +598,13 @@ protected function storeFile(array $ticData, string $tempFilePath, array $fileAr
597598
$fileSize = filesize($storagePath);
598599

599600
// Build descriptions
600-
$shortDesc = $ticData['Desc'] ?? '';
601-
$longDesc = !empty($ticData['LDesc']) ? implode("\n", $ticData['LDesc']) : '';
601+
$shortDesc = $this->sanitizeToUtf8($ticData['Desc'] ?? '');
602+
$longDesc = !empty($ticData['LDesc']) ? $this->sanitizeToUtf8(implode("\n", $ticData['LDesc'])) : '';
602603

603604
// Truncate fields to fit database constraints (VARCHAR 255)
604605
$filename = mb_substr($filename, 0, 255);
605606
$shortDesc = mb_substr($shortDesc, 0, 255);
606-
$fromAddress = mb_substr($ticData['From'] ?? '', 0, 255);
607+
$fromAddress = mb_substr($this->sanitizeToUtf8($ticData['From'] ?? ''), 0, 255);
607608

608609
// Store in database
609610
$stmt = $this->db->prepare("
@@ -744,6 +745,33 @@ protected function scanFileForViruses(int $fileId, array $fileArea): array
744745
return $result;
745746
}
746747

748+
/**
 * Convert a string to valid UTF-8, trying CP437 first (common FidoNet/DOS encoding).
 *
 * Input that already validates as UTF-8 (including the empty string) is returned
 * unchanged. Otherwise the bytes are decoded as CP437 and, if that converter is not
 * available, as ISO-8859-1, so the result can be stored without triggering a
 * PostgreSQL encoding error.
 *
 * @param string $text Raw input that may be CP437, ISO-8859-1, or already UTF-8
 * @return string Valid UTF-8 string
 */
private function sanitizeToUtf8(string $text): string
{
    if ($text === '' || mb_check_encoding($text, 'UTF-8')) {
        return $text;
    }

    if (function_exists('iconv')) {
        // Both candidates are single-byte code pages, so the //IGNORE suffix is not
        // needed; omitting it also avoids iconv builds that reject the suffix outright.
        foreach (['CP437', 'ISO-8859-1'] as $sourceEncoding) {
            // iconv() emits a notice and returns false when the charset is unsupported;
            // that is an expected, recoverable outcome here, hence the suppression.
            $converted = @iconv($sourceEncoding, 'UTF-8', $text);
            if ($converted !== false && mb_check_encoding($converted, 'UTF-8')) {
                return $converted;
            }
        }
    }

    // Last resort: mbstring does not ship a CP437 converter (an unknown encoding name
    // raises ValueError on PHP 8), so decode as ISO-8859-1, which mbstring always
    // supports and which maps every byte to a code point.
    return mb_convert_encoding($text, 'UTF-8', 'ISO-8859-1');
}
774+
747775
/**
748776
* Generate a unique hash if duplicates are allowed in this area
749777
*

0 commit comments

Comments
 (0)