Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
95 changes: 60 additions & 35 deletions web/src/components/dataset/dataset-upload-wizard.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import {
CheckCircle2,
AlertTriangle,
X,
Ban,
} from "lucide-react";
import { Button } from "@/components/ui/button";
import { Input } from "@/components/ui/input";
Expand Down Expand Up @@ -162,7 +163,7 @@ export function DatasetUploadWizard({ projectId }: DatasetUploadWizardProps) {
);

const rejected = validationResult?.rejectedRows ?? [];

const isJsonFormat = validationResult?.format === 'json';
// Add DuckDB initialization state
const { isInitializing: isDuckDbInitializing } = useDuckDb();

Expand Down Expand Up @@ -576,12 +577,16 @@ const handleCreateDataset = async () => {
}
};

const handleProceedWithWarnings = () => {
setIsValidationWarningDialogOpen(false);
if (currentStep < WIZARD_STEPS.length) {
setCurrentStep(currentStep + 1);
}
};
// Closes the validation-warning dialog and moves to the next wizard step.
// When `isJsonFormat` is set, nothing is advanced: an error toast is shown
// and the dialog is left as it is.
const handleProceedWithWarnings = () => {
  if (!isJsonFormat) {
    setIsValidationWarningDialogOpen(false);

    // Only advance while the current step is below the number of wizard steps.
    const hasNextStep = currentStep < WIZARD_STEPS.length;
    if (hasNextStep) {
      setCurrentStep(currentStep + 1);
    }
    return;
  }

  toast.error("Cannot skip rows for JSON files. Please fix the data types.");
};

const handleBack = () => {
if (currentStep > 1) {
Expand Down Expand Up @@ -1570,36 +1575,56 @@ const handleCreateDataset = async () => {
</div>
</div>

<div className="bg-blue-50 dark:bg-blue-950/30 border border-blue-200 dark:border-blue-900 rounded-lg p-4">
<div className="text-sm text-blue-800 dark:text-blue-200">
<strong>What happens if you continue:</strong>
<ul className="mt-2 space-y-1 list-disc list-inside">
<li>
Rows with data type issues will be automatically skipped
</li>
<li>
Your dataset will be created with the remaining valid rows
</li>
</ul>
</div>
</div>
{isJsonFormat ? (
<div className="bg-red-50 dark:bg-red-950/30 border border-red-200 dark:border-red-900 rounded-lg p-4 flex items-start gap-3">
<Ban className="h-5 w-5 text-red-600 shrink-0 mt-0.5" />
<div className="text-sm text-red-800 dark:text-red-200">
<strong>Action Required: Fix Data Types</strong>
<p className="mt-1">
Row skipping is not supported for JSON files due to their
hierarchical structure. You must fix the data type errors in
your source JSON file and re-upload it to proceed.
</p>
</div>
</div>

<div className="flex justify-end gap-3 pt-4">
<Button
variant="outline"
onClick={() => setIsValidationWarningDialogOpen(false)}
>
Go Back to Fix Data
</Button>
<Button
onClick={handleProceedWithWarnings}
className="bg-yellow-600 hover:bg-yellow-700"
>
Continue with {validationResult?.rejectedRows?.length || 0} Rows
Skipped
</Button>
) : (
<div className="bg-blue-50 dark:bg-blue-950/30 border border-blue-200 dark:border-blue-900 rounded-lg p-4">
<div className="text-sm text-blue-800 dark:text-blue-200">
<strong>What happens if you continue:</strong>
<ul className="mt-2 space-y-1 list-disc list-inside">
<li>
Rows with data type issues will be automatically skipped
</li>
<li>
Your dataset will be created with the remaining valid rows
</li>
</ul>
</div>
</div>
)}
</div>

<div className="flex justify-end gap-3 pt-4">
<Button
variant="outline"
onClick={() => setIsValidationWarningDialogOpen(false)}
>
Go Back to Fix Data
</Button>
{isJsonFormat ? (
<Button disabled className="opacity-50 cursor-not-allowed">
Cannot Skip Rows in JSON
</Button>
) : (
<Button
onClick={handleProceedWithWarnings}
className="bg-yellow-600 hover:bg-yellow-700"
>
Continue with {validationResult?.rejectedRows?.length || 0} Rows
Skipped
</Button>
)}
</div>
</DialogContent>
</Dialog>
</div>
Expand Down
196 changes: 159 additions & 37 deletions web/src/lib/validation/validate-file.ts
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,18 @@ export async function validateFileWithDuckDb(
// };
// }

// DuckDB auto-detects CSV column types by sampling the file.
// By default, it scans the first 20,480 rows to infer types and other properties.
//
// You can control this behavior using the `sample_size` option:
// - A positive number limits detection to that many rows
// - `sample_size: -1` forces DuckDB to scan the entire file for type detection
//
// Example (DuckDB SQL):
// SELECT * FROM read_csv('data.csv', sample_size = -1);
//
// Docs: https://duckdb.org/docs/stable/data/csv/auto_detection#sample-size

// Route to specific validation function
switch (format) {
case "csv":
Expand Down Expand Up @@ -321,82 +333,192 @@ async function validateParquetFile(
}

/**
 * Extracts row-level details from a DuckDB JSON read error message.
 *
 * Example input:
 *   Invalid Input Error: JSON transform error in file "...", in line 30904:
 *   Could not parse string "not-a-date" according to format...
 *
 * @param errorMessage - Raw message text of the error raised by DuckDB.
 * @returns A rejected-row record where:
 *   - `rowNumber` is the number following "in line" (0 when absent);
 *   - `actualValue` is the quoted text following "Could not parse string",
 *     including an empty string ("unknown" when the phrase is absent);
 *   - `errorMessage` is the input with everything up to and including the
 *     first "Invalid Input Error: " removed and the first ` in file "..."`
 *     fragment stripped;
 *   - `columnName` and `expectedType` are fixed placeholders, because the
 *     message does not identify the JSON key.
 *   Returns `null` when `errorMessage` is not a string.
 */
function parseDuckDbJsonError(errorMessage: string): RejectedRow | null {
  // Types are erased at runtime; return null for non-string input rather
  // than throwing from the string operations below.
  if (typeof errorMessage !== "string") {
    return null;
  }

  // 1. Line number reported by DuckDB, or 0 when the message carries none.
  const lineDigits = /in line (\d+)/.exec(errorMessage)?.[1];
  const rowNumber =
    lineDigits === undefined ? 0 : Number.parseInt(lineDigits, 10);

  // 2. Offending value. `*` (not `+`) so that an empty string value ("") is
  //    reported as-is instead of being mistaken for "no value found".
  const quotedValue = /Could not parse string "([^"]*)"/.exec(errorMessage)?.[1];
  const actualValue = quotedValue ?? "unknown";

  // 3. Cleaner message: drop everything up to and including the first
  //    prefix, keeping the whole remainder even if the prefix text repeats.
  const prefix = "Invalid Input Error: ";
  const prefixAt = errorMessage.indexOf(prefix);
  const withoutPrefix =
    prefixAt === -1 ? errorMessage : errorMessage.slice(prefixAt + prefix.length);

  // Clean up the file name from the message (first occurrence only).
  const cleanMsg = withoutPrefix.replace(/ in file "[^"]+"/, "");

  return {
    rowNumber,
    columnName: "unknown", // JSON errors often don't identify the key name easily
    expectedType: "valid format",
    actualValue,
    errorMessage: cleanMsg,
  };
}

/**
 * Validates a JSON file using a "Double Pass" strategy.
 *
 * Pass 1 loads the registered file into a temporary table with
 * `ignore_errors=true`, trying `read_json_auto` with its default format, then
 * `format='newline_delimited'`, then `format='array'`. Pass 2 re-reads the
 * file with the format that succeeded but without `ignore_errors`; a failure
 * whose message contains "JSON transform error" or "Could not parse" is
 * converted into at most one rejected row via `parseDuckDbJsonError`.
 *
 * NOTE(review): this span interleaves lines that look like the previous
 * implementation (`store_rejects=true` queries, `lastError`,
 * `getRejectedRows`) with the double-pass lines. Confirm which set is meant
 * to remain before relying on the description above.
 *
 * @param db - DuckDB instance used to register the file buffer and open a connection.
 * @param fileArrayBuffer - Raw bytes of the JSON file.
 * @returns The table-structure validation result tagged with `format: "json"`
 *   plus the rejected rows; when an error reaches the outer `catch`, an
 *   object with `isValid: false` and an `error` message.
 */
async function validateJsonFile(
db: duckdb.AsyncDuckDB,
fileArrayBuffer: ArrayBuffer
): Promise<ValidationResult> {
const virtualFileName = `temp_${Date.now()}.json`;
const conn = await db.connect();

// We keep track of the successful format to run the strict check later
let successfulFormat: string | null = null;

const tempTableName = `temp_validate_${Date.now()}`;
let tableCreated = false;

// Error to throw if everything fails (structural issues)
let criticalError: Error | null = null;

try {
// Register the bytes under the virtual file name that the read_json_auto
// queries below reference.
await db.registerFileBuffer(
virtualFileName,
new Uint8Array(fileArrayBuffer)
);

// NOTE(review): second declaration of `tempTableName`; inside this `try` it
// shadows the one declared before the `try` — confirm which is intended.
const tempTableName = `temp_validate_${Date.now()}`;
// ============================================================
// PASS 1: LENIENT LOAD (Get the data, ignore errors)
// ============================================================
// We try to load data so the user has a preview, even if there are errors.

// 1. Try Auto with ignore_errors
if (!tableCreated) {
try {
await conn.query(`
CREATE TABLE ${tempTableName} AS
SELECT * FROM read_json_auto('${virtualFileName}', ignore_errors=true)
LIMIT 1000000
`);
tableCreated = true;
successfulFormat = "auto";
} catch (e) {
criticalError = e as Error;
}
}

// Try different JSON formats with IGNORE_ERRORS
// Add LIMIT to process only first 1GB worth of data to avoid memory issues
let createQuery = "";
let lastError: Error | null = null;
// 2. Try Newline Delimited with ignore_errors
if (!tableCreated) {
try {
await conn.query(`
CREATE TABLE ${tempTableName} AS
SELECT * FROM read_json_auto('${virtualFileName}', format='newline_delimited', ignore_errors=true)
LIMIT 1000000
`);
tableCreated = true;
successfulFormat = "newline_delimited";
criticalError = null;
} catch (e) {
criticalError = e as Error;
}
}

try {
// Try auto-detection first with IGNORE_ERRORS
createQuery = `CREATE TABLE ${tempTableName} AS SELECT * FROM read_json_auto('${virtualFileName}', ignore_errors=true, store_rejects=true) LIMIT 1000000`;
await conn.query(createQuery);
} catch (autoError) {
lastError = autoError as Error;
// 3. Try Array with ignore_errors
if (!tableCreated) {
try {
// Try newline-delimited JSON
createQuery = `CREATE TABLE ${tempTableName} AS SELECT * FROM read_json_auto('${virtualFileName}', format='newline_delimited', ignore_errors=true, store_rejects=true) LIMIT 1000000`;
await conn.query(createQuery);
lastError = null;
} catch (ndJsonError) {
lastError = ndJsonError as Error;
try {
// Try array format
createQuery = `CREATE TABLE ${tempTableName} AS SELECT * FROM read_json_auto('${virtualFileName}', format='array', ignore_errors=true, store_rejects=true) LIMIT 1000000`;
await conn.query(createQuery);
lastError = null;
} catch (ndJsonError) {
lastError = ndJsonError as Error;
}
await conn.query(`
CREATE TABLE ${tempTableName} AS
SELECT * FROM read_json_auto('${virtualFileName}', format='array', ignore_errors=true)
LIMIT 1000000
`);
tableCreated = true;
successfulFormat = "array";
criticalError = null;
} catch (e) {
criticalError = e as Error;
}
}

if (lastError) {
throw lastError;
// If we couldn't create the table at all, it's a structural failure.
if (!tableCreated) {
throw criticalError || new Error("Could not parse JSON file in any supported format.");
}

const result = await validateTableStructure(conn, tempTableName);
// ============================================================
// PASS 2: STRICT CHECK (Re-create table without ignore_errors)
// ============================================================
// We attempt to create a second temporary table strictly.
// If this fails, we catch the error to report it as a RejectedRow.

const rejectedRows: RejectedRow[] = [];
const strictTableName = `temp_strict_${Date.now()}`;

try {
// Build the query exactly like Pass 1 but remove ignore_errors
let strictQuery = `CREATE TABLE ${strictTableName} AS SELECT * FROM read_json_auto('${virtualFileName}'`;

if (successfulFormat === 'newline_delimited') {
strictQuery += `, format='newline_delimited'`;
} else if (successfulFormat === 'array') {
strictQuery += `, format='array'`;
}
// Note: We deliberately exclude ignore_errors=true here

strictQuery += `) LIMIT 1000000`;

// Run it.
await conn.query(strictQuery);

// If it succeeds, great! No rejected rows.
} catch (strictError) {
const errMsg = (strictError as Error).message;

// We only want to report data transformation errors as "Rejected Rows"
// (Structure errors would have been caught in Pass 1 unless ignore_errors hid them)
if (errMsg.includes("JSON transform error") || errMsg.includes("Could not parse")) {
const rejectedRow = parseDuckDbJsonError(errMsg);
if (rejectedRow) {
rejectedRows.push(rejectedRow);
}
}
} finally {
// Always clean up the strict table if it was created
await conn.query(`DROP TABLE IF EXISTS ${strictTableName}`);
}

// Check for rejected rows
const rejectedRowsResult = await getRejectedRows(conn);
// ============================================================
// FINALIZE
// ============================================================

// Get structure from the successfully created Pass 1 table
const result = await validateTableStructure(conn, tempTableName);

// Clean up the Pass 1 table
await conn.query(`DROP TABLE IF EXISTS ${tempTableName}`);
await conn.close();

return {
...result,
format: "json",
// NOTE(review): `rejectedRows` / `rejectedRowCount` are listed twice in this
// object literal; confirm which pair is meant to be returned.
rejectedRows: rejectedRowsResult.rejectedRows,
rejectedRowCount: rejectedRowsResult.rejectedRowCount,
rejectedRows: rejectedRows,
rejectedRowCount: rejectedRows.length,
};

} catch (error) {
// Close the connection on the failure path as well, then report the error
// as a result object instead of rethrowing.
await conn.close();
return {
isValid: false,
format: "json",
error: `JSON validation failed: ${
(error as Error).message
}. Supported formats: auto-detect, newline-delimited, array`,
error: `JSON validation failed: ${(error as Error).message}`,
};
}
}

/**
* Validates an Excel file by converting it to CSV first using SheetJS
*/
Expand Down