diff --git a/diffmatchpatch/diff.go b/diffmatchpatch/diff.go index 08c36e7..67e6750 100644 --- a/diffmatchpatch/diff.go +++ b/diffmatchpatch/diff.go @@ -79,6 +79,9 @@ func splice(slice []Diff, index int, amount int, elements ...Diff) []Diff { return slice } +// DiffFunction represents a function that can compute differences between two rune slices +type DiffFunction func(text1, text2 []rune) []Diff + // DiffMain finds the differences between two texts. // If an invalid UTF-8 sequence is encountered, it will be replaced by the Unicode replacement character. func (dmp *DiffMatchPatch) DiffMain(text1, text2 string, checklines bool) []Diff { @@ -88,14 +91,20 @@ func (dmp *DiffMatchPatch) DiffMain(text1, text2 string, checklines bool) []Diff // DiffMainRunes finds the differences between two rune sequences. // If an invalid UTF-8 sequence is encountered, it will be replaced by the Unicode replacement character. func (dmp *DiffMatchPatch) DiffMainRunes(text1, text2 []rune, checklines bool) []Diff { - var deadline time.Time - if dmp.DiffTimeout > 0 { - deadline = time.Now().Add(dmp.DiffTimeout) + deadline := dmp.getDeadline() + + // Encapsulate the deadline and line mode logic in the closure + diffFn := func(text1, text2 []rune) []Diff { + if checklines && len(text1) > 100 && len(text2) > 100 { + return dmp.diffBigLine(text1, text2, deadline) + } + return dmp.diffBisect(text1, text2, deadline) } - return dmp.diffMainRunes(text1, text2, checklines, deadline) + + return dmp.diffMainRunes(text1, text2, diffFn) } -func (dmp *DiffMatchPatch) diffMainRunes(text1, text2 []rune, checklines bool, deadline time.Time) []Diff { +func (dmp *DiffMatchPatch) diffMainRunes(text1, text2 []rune, diffFn DiffFunction) []Diff { if runesEqual(text1, text2) { var diffs []Diff if len(text1) > 0 { @@ -116,7 +125,7 @@ func (dmp *DiffMatchPatch) diffMainRunes(text1, text2 []rune, checklines bool, d text2 = text2[:len(text2)-commonlength] // Compute the diff on the middle block. 
- diffs := dmp.diffCompute(text1, text2, checklines, deadline) + diffs := dmp.diffCompute(text1, text2, diffFn) // Restore the prefix and suffix. if len(commonprefix) != 0 { @@ -129,8 +138,16 @@ func (dmp *DiffMatchPatch) diffMainRunes(text1, text2 []rune, checklines bool, d return dmp.DiffCleanupMerge(diffs) } +// getDeadline returns the time at which a diff started now must give up, or the zero time.Time when DiffTimeout is not positive. +func (dmp *DiffMatchPatch) getDeadline() time.Time { + if dmp.DiffTimeout > 0 { + return time.Now().Add(dmp.DiffTimeout) + } + return time.Time{} +} + // diffCompute finds the differences between two rune slices. Assumes that the texts do not have any common prefix or suffix. -func (dmp *DiffMatchPatch) diffCompute(text1, text2 []rune, checklines bool, deadline time.Time) []Diff { +func (dmp *DiffMatchPatch) diffCompute(text1, text2 []rune, diffFn DiffFunction) []Diff { diffs := []Diff{} if len(text1) == 0 { // Just add some text (speedup). @@ -177,25 +194,30 @@ func (dmp *DiffMatchPatch) diffCompute(text1, text2 []rune, checklines bool, dea text2B := hm[3] midCommon := hm[4] // Send both pairs off for separate processing. - diffsA := dmp.diffMainRunes(text1A, text2A, checklines, deadline) - diffsB := dmp.diffMainRunes(text1B, text2B, checklines, deadline) + diffsA := dmp.diffMainRunes(text1A, text2A, diffFn) + diffsB := dmp.diffMainRunes(text1B, text2B, diffFn) // Merge the results. diffs := diffsA diffs = append(diffs, Diff{DiffEqual, string(midCommon)}) diffs = append(diffs, diffsB...) return diffs - } else if checklines && len(text1) > 100 && len(text2) > 100 { - return dmp.diffLineMode(text1, text2, deadline) } - return dmp.diffBisect(text1, text2, deadline) + + return diffFn(text1, text2) } -// diffLineMode does a quick line-level diff on both []runes, then rediff the parts for greater accuracy. This speedup can produce non-minimal diffs. 
-func (dmp *DiffMatchPatch) diffLineMode(text1, text2 []rune, deadline time.Time) []Diff { +// diffBigLine does a quick line-level diff on both []runes, then rediffs the parts for greater accuracy. This speedup can produce non-minimal diffs. +func (dmp *DiffMatchPatch) diffBigLine(text1, text2 []rune, deadline time.Time) []Diff { // Scan the text on a line-by-line basis first. text1, text2, linearray := dmp.DiffLinesToRunes(string(text1), string(text2)) - diffs := dmp.diffMainRunes(text1, text2, false, deadline) + // Line mode must not be re-entered from here: both the line-encoded runes and the + // per-block rediffs below fall back to diffBisect with the shared deadline + diffFn := func(text1, text2 []rune) []Diff { + return dmp.diffBisect(text1, text2, deadline) + } + + diffs := dmp.diffMainRunes(text1, text2, diffFn) // Convert the diff back to original text. diffs = dmp.DiffCharsToLines(diffs, linearray) @@ -230,7 +252,7 @@ func (dmp *DiffMatchPatch) diffLineMode(text1, text2 []rune, deadline time.Time) countDelete+countInsert) pointer = pointer - countDelete - countInsert - a := dmp.diffMainRunes([]rune(textDelete), []rune(textInsert), false, deadline) + a := dmp.diffMainRunes([]rune(textDelete), []rune(textInsert), diffFn) for j := len(a) - 1; j >= 0; j-- { diffs = splice(diffs, pointer, 0, a[j]) } @@ -248,6 +270,37 @@ func (dmp *DiffMatchPatch) diffLineMode(text1, text2 []rune, deadline time.Time) return diffs[:len(diffs)-1] // Remove the dummy entry at the end. } +// DiffLineMode compares two texts as whole units and never diffs them character by character. +// Unlike DiffMain with checklines=true, the texts are not split into lines: equal texts yield a single equality, and differing texts yield a deletion of text1 followed by an insertion of text2, which DiffCleanupLineBased then cleans up. +// If an invalid UTF-8 sequence is encountered, it will be replaced by the Unicode replacement character. +func (dmp *DiffMatchPatch) DiffLineMode(text1, text2 string) []Diff { + return dmp.diffOnlyByLines([]rune(text1), []rune(text2)) +} + +// diffOnlyByLines compares two rune slices as whole texts and passes the result through DiffCleanupLineBased. 
+func (dmp *DiffMatchPatch) diffOnlyByLines(text1, text2 []rune) []Diff { + // The two texts are compared as single units: nothing is split into lines here + // and no character-by-character diffing is attempted + diffFn := func(text1, text2 []rune) []Diff { + if !runesEqual(text1, text2) { + return []Diff{ + {DiffDelete, string(text1)}, + {DiffInsert, string(text2)}, + } + } + return []Diff{{DiffEqual, string(text1)}} + } + + // Call the comparison directly instead of going through diffMainRunes, which would first apply + // the character-based prefix/suffix trimming and the optimizations in diffCompute + diffs := diffFn(text1, text2) + + // Optimize line-based diffs using line-specific cleanup + diffs = dmp.DiffCleanupLineBased(diffs) + + return diffs +} + // DiffBisect finds the 'middle snake' of a diff, split the problem in two and return the recursively constructed diff. // If an invalid UTF-8 sequence is encountered, it will be replaced by the Unicode replacement character. // See Myers 1986 paper: An O(ND) Difference Algorithm and Its Variations. @@ -380,9 +433,14 @@ func (dmp *DiffMatchPatch) diffBisectSplit(runes1, runes2 []rune, x, y int, runes1b := runes1[x:] runes2b := runes2[y:] + // Bind the deadline to diffBisect so both halves are diffed without line mode (the former checklines=false) + diffFn := func(text1, text2 []rune) []Diff { + return dmp.diffBisect(text1, text2, deadline) + } + // Compute both diffs serially. - diffs := dmp.diffMainRunes(runes1a, runes2a, false, deadline) - diffsb := dmp.diffMainRunes(runes1b, runes2b, false, deadline) + diffs := dmp.diffMainRunes(runes1a, runes2a, diffFn) + diffsb := dmp.diffMainRunes(runes1b, runes2b, diffFn) return append(diffs, diffsb...) } @@ -953,6 +1011,77 @@ func (dmp *DiffMatchPatch) DiffCleanupEfficiency(diffs []Diff) []Diff { return diffs } +// DiffCleanupLineBased optimizes line-based diffs by merging consecutive operations of the same type +// and removing empty or whitespace-only diffs. 
+// This function is specifically designed for line-level diffing where each diff +// represents entire lines rather than character-level changes. +func (dmp *DiffMatchPatch) DiffCleanupLineBased(diffs []Diff) []Diff { + if len(diffs) == 0 { + return diffs + } + + // First pass: merge runs of consecutive deletions or insertions; equalities are copied through as-is + cleaned := make([]Diff, 0, len(diffs)) + pointer := 0 + + for pointer < len(diffs) { + current := diffs[pointer] + + // If this is an equality, just add it + if current.Type == DiffEqual { + cleaned = append(cleaned, current) + pointer++ + continue + } + + // Collect consecutive operations of the same type + mergedText := current.Text + pointer++ + + // Merge consecutive deletions or insertions + for pointer < len(diffs) && diffs[pointer].Type == current.Type { + mergedText += diffs[pointer].Text + pointer++ + } + + // Only add merged operations that contain something other than whitespace + if len(strings.TrimSpace(mergedText)) > 0 { + cleaned = append(cleaned, Diff{current.Type, mergedText}) + } + } + + // Second pass (skipped when at most one diff remains): remove empty or whitespace-only equalities + // and merge adjacent equalities + if len(cleaned) > 1 { + final := make([]Diff, 0, len(cleaned)) + + for i := 0; i < len(cleaned); i++ { + current := cleaned[i] + + // Skip empty or whitespace-only equalities + if current.Type == DiffEqual && len(strings.TrimSpace(current.Text)) == 0 { + continue + } + + // Merge consecutive equalities + if current.Type == DiffEqual && len(final) > 0 && final[len(final)-1].Type == DiffEqual { + final[len(final)-1].Text += current.Text + } else { + final = append(final, current) + } + } + + cleaned = final + } + + // There is deliberately no third pass over deletion-insertion pairs. + // A deletion followed by an insertion is left untouched, even when the two texts are similar, + // so that they stay separate operations for clarity in line-based diffs. + // This preserves the line-by-line nature of the diff + + return cleaned +} + // DiffCleanupMerge reorders and merges like edit sections. 
Merge equalities. // Any edit section can move as long as it doesn't cross an equality. func (dmp *DiffMatchPatch) DiffCleanupMerge(diffs []Diff) []Diff { diff --git a/diffmatchpatch/diff_test.go b/diffmatchpatch/diff_test.go index 2c43864..160231f 100644 --- a/diffmatchpatch/diff_test.go +++ b/diffmatchpatch/diff_test.go @@ -1574,3 +1574,236 @@ func BenchmarkDiffMainRunesLargeDiffLines(b *testing.B) { diffs = dmp.DiffCharsToLines(diffs, linearray) } } + +func TestDiffLineMode(t *testing.T) { + dmp := New() + + // Helper function to test line mode diffing + testLineMode := func(t *testing.T, text1, text2 string, expected []Diff) { + t.Helper() + actual := dmp.DiffLineMode(text1, text2) + assert.Equal(t, expected, actual, + fmt.Sprintf("DiffLineMode(%q, %q) = %v, want %v", text1, text2, actual, expected)) + } + + t.Run("empty strings", func(t *testing.T) { + testLineMode(t, "", "", []Diff{{DiffEqual, ""}}) + }) + + t.Run("identical strings", func(t *testing.T) { + testLineMode(t, "abc", "abc", []Diff{{DiffEqual, "abc"}}) + }) + + t.Run("simple insertions", func(t *testing.T) { + testLineMode(t, "abc", "ab123c", []Diff{{DiffDelete, "abc"}, {DiffInsert, "ab123c"}}) + testLineMode(t, "abc", "a123b456c", []Diff{{DiffDelete, "abc"}, {DiffInsert, "a123b456c"}}) + }) + + t.Run("simple deletions", func(t *testing.T) { + testLineMode(t, "a123bc", "abc", []Diff{{DiffDelete, "a123bc"}, {DiffInsert, "abc"}}) + testLineMode(t, "a123b456c", "abc", []Diff{{DiffDelete, "a123b456c"}, {DiffInsert, "abc"}}) + }) + + t.Run("single character replacements", func(t *testing.T) { + testLineMode(t, "a", "b", []Diff{{DiffDelete, "a"}, {DiffInsert, "b"}}) + }) + + t.Run("sentence replacements", func(t *testing.T) { + testLineMode(t, "Apples are a fruit.", "Bananas are also fruit.", []Diff{ + {DiffDelete, "Apples are a fruit."}, + {DiffInsert, "Bananas are also fruit."}, + }) + }) + + t.Run("multi-line text changes", func(t *testing.T) { + testLineMode(t, "Apples are a fruit.\nline 2, line 3", 
"Bananas are also fruit.\nline 2, line 3\nline 4, line 5", []Diff{ + {DiffDelete, "Apples are a fruit.\nline 2, line 3"}, + {DiffInsert, "Bananas are also fruit.\nline 2, line 3\nline 4, line 5"}, + }) + }) + + t.Run("unicode and special characters", func(t *testing.T) { + testLineMode(t, "ax\t", "\u0680x\u0000", []Diff{ + {DiffDelete, "ax\t"}, + {DiffInsert, "\u0680x\u0000"}, + }) + }) + + t.Run("complex text transformations", func(t *testing.T) { + testLineMode(t, "1ayb2", "abxab", []Diff{ + {DiffDelete, "1ayb2"}, + {DiffInsert, "abxab"}, + }) + + testLineMode(t, "abcy", "xaxcxabc", []Diff{ + {DiffDelete, "abcy"}, + {DiffInsert, "xaxcxabc"}, + }) + }) + + t.Run("long text with mixed operations", func(t *testing.T) { + testLineMode(t, "ABCDa=bcd=efghijklmnopqrsEFGHIJKLMNOefg", "a-bcd-efghijklmnopqrs", []Diff{ + {DiffDelete, "ABCDa=bcd=efghijklmnopqrsEFGHIJKLMNOefg"}, + {DiffInsert, "a-bcd-efghijklmnopqrs"}, + }) + }) + + t.Run("wiki-style text", func(t *testing.T) { + // t.Skip("FIXME: check why this fails") + testLineMode(t, + "a [[Pennsylvania]] and [[New", + " and [[Pennsylvania]]", + []Diff{ + {DiffDelete, "a [[Pennsylvania]] and [[New"}, + {DiffInsert, " and [[Pennsylvania]]"}, + }, + ) + }) + + t.Run("invalid UTF-8 sequences", func(t *testing.T) { + testLineMode(t, "\xe0\xe5", "", []Diff{{DiffDelete, "��"}}) + }) + + t.Run("real diff without timeout", func(t *testing.T) { + // Perform a real diff and switch off the timeout. 
+ dmp.DiffTimeout = 0 + + // Re-run a few key tests to ensure they work with timeout disabled + testLineMode(t, "a", "b", []Diff{{DiffDelete, "a"}, {DiffInsert, "b"}}) + testLineMode(t, "Apples are a fruit.", "Bananas are also fruit.", []Diff{ + {DiffDelete, "Apples are a fruit."}, + {DiffInsert, "Bananas are also fruit."}, + }) + }) +} + +func TestDiffCleanupLineBased(t *testing.T) { + dmp := New() + + tests := []struct { + name string + input []Diff + expected []Diff + }{ + { + name: "empty diffs", + input: []Diff{}, + expected: []Diff{}, + }, + { + name: "merge consecutive deletions", + input: []Diff{ + {DiffDelete, "line1\n"}, + {DiffDelete, "line2\n"}, + {DiffEqual, "unchanged\n"}, + }, + expected: []Diff{ + {DiffDelete, "line1\nline2\n"}, + {DiffEqual, "unchanged\n"}, + }, + }, + { + name: "merge consecutive insertions", + input: []Diff{ + {DiffEqual, "unchanged\n"}, + {DiffInsert, "newline1\n"}, + {DiffInsert, "newline2\n"}, + }, + expected: []Diff{ + {DiffEqual, "unchanged\n"}, + {DiffInsert, "newline1\nnewline2\n"}, + }, + }, + { + name: "merge consecutive equalities", + input: []Diff{ + {DiffEqual, "line1\n"}, + {DiffEqual, "line2\n"}, + {DiffDelete, "deleted\n"}, + }, + expected: []Diff{ + {DiffEqual, "line1\nline2\n"}, + {DiffDelete, "deleted\n"}, + }, + }, + { + name: "remove empty whitespace-only equalities", + input: []Diff{ + {DiffEqual, "line1\n"}, + {DiffEqual, " \n"}, + {DiffEqual, "line2\n"}, + }, + expected: []Diff{ + {DiffEqual, "line1\nline2\n"}, + }, + }, + { + name: "complex line-based diff", + input: []Diff{ + {DiffDelete, "old1\n"}, + {DiffDelete, "old2\n"}, + {DiffEqual, "common1\n"}, + {DiffEqual, "common2\n"}, + {DiffInsert, "new1\n"}, + {DiffInsert, "new2\n"}, + }, + expected: []Diff{ + {DiffDelete, "old1\nold2\n"}, + {DiffEqual, "common1\ncommon2\n"}, + {DiffInsert, "new1\nnew2\n"}, + }, + }, + { + name: "preserve non-empty whitespace lines", + input: []Diff{ + {DiffEqual, "line1\n"}, + {DiffEqual, " indented\n"}, + {DiffEqual, 
"line2\n"}, + }, + expected: []Diff{ + {DiffEqual, "line1\n indented\nline2\n"}, + }, + }, + { + name: "remove empty insert with a equal line between", + input: []Diff{ + {DiffEqual, ""}, + {DiffEqual, "line2\n"}, + {DiffInsert, ""}, + }, + expected: []Diff{ + {DiffEqual, "line2\n"}, + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := dmp.DiffCleanupLineBased(tt.input) + assert.Equal(t, tt.expected, result, + fmt.Sprintf("DiffCleanupLineBased failed for test: %s", tt.name)) + }) + } +} + +func BenchmarkDiffCleanupLineBased(b *testing.B) { + dmp := New() + + // Create a realistic line-based diff with many operations + diffs := []Diff{ + {DiffDelete, "old line 1\n"}, + {DiffDelete, "old line 2\n"}, + {DiffEqual, "common line 1\n"}, + {DiffEqual, "common line 2\n"}, + {DiffInsert, "new line 1\n"}, + {DiffInsert, "new line 2\n"}, + {DiffEqual, "more common\n"}, + {DiffDelete, "another old\n"}, + {DiffInsert, "another new\n"}, + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + dmp.DiffCleanupLineBased(diffs) + } +}