diff --git a/.gitignore b/.gitignore index 6d8cb8b622..5766bc7787 100644 --- a/.gitignore +++ b/.gitignore @@ -44,6 +44,7 @@ TAGS doc/devel/ doc/html/ .deps +.dirstamp libtool make.log make.clang @@ -57,3 +58,4 @@ tests/src/editor/test-data.txt src/vfs/extfs/helpers/usqfs tests/src/vfs/extfs/helpers-list/data/config.sh mc-version.h +/ts-grammars-shared/ diff --git a/acinclude.m4 b/acinclude.m4 index 685962da4a..210e83597b 100644 --- a/acinclude.m4 +++ b/acinclude.m4 @@ -17,6 +17,7 @@ m4_include([m4.include/mc-with-x.m4]) m4_include([m4.include/mc-use-termcap.m4]) m4_include([m4.include/mc-with-screen.m4]) m4_include([m4.include/mc-with-internal-edit.m4]) +m4_include([m4.include/mc-with-tree-sitter.m4]) m4_include([m4.include/mc-subshell.m4]) m4_include([m4.include/mc-background.m4]) m4_include([m4.include/mc-ext2fs-attr.m4]) diff --git a/configure.ac b/configure.ac index 5a5155f9e0..cda268f01c 100644 --- a/configure.ac +++ b/configure.ac @@ -41,6 +41,7 @@ dnl that ar cannot be found and linking via libtool will fail at a later stage AC_CHECK_TOOLS([AR], [ar gar]) AC_PROG_CC +AC_PROG_CXX # AC_PROG_CC doesn't try enabling C99 in autoconf 2.69 and below, but # AC_PROG_CC_C99 is deprecated in newer ones. 
In autoconf 2.70+ both @@ -536,11 +537,18 @@ AC_SUBST(CPPFLAGS) AC_SUBST(LDFLAGS) AC_SUBST(LIBS) +dnl ############################################################################ +dnl Syntax highlighting system selection +dnl ############################################################################ + +mc_WITH_TREE_SITTER + AM_CONDITIONAL(USE_NLS, [test x"$USE_NLS" = xyes]) AM_CONDITIONAL(USE_MAINTAINER_MODE, [test x"$USE_MAINTAINER_MODE" = xyes]) AM_CONDITIONAL(USE_SCREEN_SLANG, [test x"$with_screen" = xslang]) AM_CONDITIONAL(USE_INTERNAL_EDIT, [test x"$use_internal_edit" = xyes ]) AM_CONDITIONAL(USE_ASPELL, [test x"$enable_aspell" = xyes ]) +AM_CONDITIONAL(USE_TREE_SITTER, [test x"$with_tree_sitter" = xyes]) AM_CONDITIONAL(USE_DIFF, [test -n "$use_diff"]) AM_CONDITIONAL(CONS_SAVER, [test -n "$cons_saver"]) dnl Clarify do we really need GModule @@ -654,6 +662,7 @@ lib/vfs/Makefile lib/widget/Makefile misc/syntax/Makefile +misc/syntax-ts/Makefile doc/Makefile diff --git a/doc/Makefile.am b/doc/Makefile.am index dcf6f84740..de69ac6b54 100644 --- a/doc/Makefile.am +++ b/doc/Makefile.am @@ -1,4 +1,4 @@ DIST_SUBDIRS = hints man hlp SUBDIRS = hints man hlp -EXTRA_DIST = FAQ HACKING INSTALL +EXTRA_DIST = FAQ HACKING INSTALL TREE-SITTER diff --git a/doc/TREE-SITTER b/doc/TREE-SITTER new file mode 100644 index 0000000000..e351acd4a2 --- /dev/null +++ b/doc/TREE-SITTER @@ -0,0 +1,608 @@ +Tree-sitter syntax highlighting for the internal editor +------------------------------------------------------- + +Contents +-------- + +* Introduction +* Building with tree-sitter support +* Installing grammars +* How it works +* Language injection +* Wrapper grammars +* Parse error highlighting +* Syntax highlighting modes +* File layout +* Per-grammar configuration (config.ini) +* Highlight query files +* Color mapping +* Adding a new language +* Removing or disabling a language +* Updating grammars +* Validating query files +* Testing with mc-syntax-dump +* Troubleshooting +* 
Limitations + + +Introduction +------------ + +The internal editor (mcedit) supports an alternative syntax highlighting +backend based on tree-sitter (https://tree-sitter.github.io/). When +enabled at build time, tree-sitter provides AST-based highlighting that +is more accurate than the legacy regex-based system. + +The tree-sitter backend is enabled with the --with-tree-sitter configure +flag. When a file is opened in the editor, the system first attempts to +match it against a tree-sitter grammar. If no grammar is found (or the +query file fails to compile), the editor falls back to the legacy +regex-based highlighting from *.syntax files. + +The two highlighting systems share the same rendering path: both produce +a color pair integer per byte, which is consumed by editdraw.c. + +MC ships no grammar data. Grammar libraries (.so), query files (.scm), +and per-grammar configuration (config.ini) are provided by the external +mc-ts-grammars repository (https://github.com/jtyr/mc-ts-grammars) and +installed via the mc-ts-grammar tool or by distro packages. + +68 languages have MC-curated query files and configuration. Distros or +users can add support for additional languages by providing their own +query files and config.ini. + + +Building with tree-sitter support +--------------------------------- + +Requirements: + +- libtree-sitter >= 0.22 (development headers and shared library) + - https://github.com/tree-sitter/tree-sitter +- gmodule-2.0 (part of GLib, for loading grammar .so modules) + +Build: + + ./configure --with-tree-sitter + make -j$(nproc) + +Grammar .so modules are loaded at runtime via g_module_open(). The mc +binary stays small regardless of how many grammars are installed. + +To build without tree-sitter (default): + + ./configure + make -j$(nproc) + +When --with-tree-sitter is not used, no tree-sitter headers or libraries +are required and the tree-sitter code is excluded via #ifdef +HAVE_TREE_SITTER guards. 
+ + +Installing grammars +------------------- + +Grammars are installed separately from MC using the mc-ts-grammar tool +from the mc-ts-grammars repository: + + # Install all available grammars from the latest release + mc-ts-grammar install --all + + # Install specific grammars + mc-ts-grammar install python bash yaml markdown + + # Build grammars from source (requires tree-sitter CLI + C compiler) + git clone https://github.com/jtyr/mc-ts-grammars + cd mc-ts-grammars + mc-ts-grammar build --install + +Grammars are installed into per-grammar directories: + + ~/.local/share/mc/syntax-ts/<name>/config.ini + ~/.local/share/mc/syntax-ts/<name>/highlights.scm + ~/.local/share/mc/syntax-ts/<name>/injections.scm (if applicable) + ~/.local/lib/mc/ts-grammars/<name>.so + +Distros can install grammars into system paths: + + /usr/share/mc/syntax-ts/<name>/config.ini + /usr/share/mc/syntax-ts/<name>/highlights.scm + $(libdir)/mc/ts-grammars/<name>.so + + +How it works +------------ + +The integration point is edit_get_syntax_color() in src/editor/syntax.c. +The renderer in editdraw.c calls this function for each byte position to +get a color pair. When tree-sitter is active the following happens: + +1. Initialization (edit_load_syntax / ts_init_for_file): + - The grammar registry is loaded on first use by scanning all + <prefix>/share/mc/syntax-ts/*/config.ini files (user-local first, + then system). This builds lookup tables for extensions, filenames, + shebangs, display names, symbols, wrappers, and colors. + - The file is matched against a grammar using these lookup tables. + Precedence: filenames > shebangs > extensions. + - The grammar .so module is loaded via g_module_open() from + ~/.local/lib/mc/ts-grammars/ or $(libdir)/mc/ts-grammars/. + - A TSParser is created for the grammar. + - The highlight query file (<name>/highlights.scm) is loaded and + compiled with ts_query_new(). + - If an injections.scm query file exists for this grammar, injection + parsers and queries are initialized. 
+ - If any step fails, ts_active is set to FALSE and the legacy system + takes over transparently. + +2. Parsing: + - The TSParser uses a TSInput callback (ts_input_read) that reads + directly from MC's edit buffer (edit_buffer_t). No copy of the + file contents is made. + - The initial parse produces a TSTree. + +3. Incremental re-parsing: + - When the buffer is modified (insert, delete, backspace), the + function edit_syntax_ts_notify_edit() is called from edit.c. + - It calls ts_tree_edit() with the byte range of the change. The + actual re-parse is deferred to the next highlight cache rebuild. + +4. Highlight cache (ts_rebuild_highlight_cache): + - A TSQueryCursor is restricted to a byte range around the viewport + (viewport +/- 8 KB). + - Query matches produce (start_byte, end_byte, color) entries stored + in a GArray cache (ts_highlights). + - If injection is active, injection ranges are collected from the + primary tree and injection queries are run on those ranges. + - For dynamic injection, fenced code blocks are detected, the + language name is read from the buffer, and the content is parsed + with the matching grammar (with per-language caching). + - The cache is rebuilt when the viewport scrolls outside the cached + range or the tree needs re-parsing. + +5. Color lookup (ts_get_color_at): + - For each byte, a linear scan in the highlight cache returns the + matching color (last match wins, so innermost captures take + precedence). + + +Language injection +------------------ + +Language injection allows one grammar to delegate parsing of specific +AST nodes to another grammar. Injections are configured via standard +tree-sitter injections.scm query files in the per-grammar directory. + +An injections.scm file uses tree-sitter query syntax with special +capture names and predicates: + + @injection.content -- the node whose text is parsed by the child grammar + @injection.language -- a node whose text names the child grammar + #set! 
injection.language "<name>" -- a fixed child grammar name + +For example, Markdown injects inline elements and fenced code blocks: + + ((inline) @injection.content + (#set! injection.language "markdown_inline")) + + (fenced_code_block + (info_string (language) @injection.language) + (code_fence_content) @injection.content) + +Injections depend on other grammars being installed. If the injected +grammar is not installed, the injection is silently skipped and the text +stays in the parent language's default color. + +Injections are recursive up to 3 levels deep. For example, a Go +template file wrapping Markdown can highlight fenced code blocks: +gotmpl -> markdown -> python (3 levels). + + +Wrapper grammars +---------------- + +Wrapper grammars are template languages (like Go templates) that wrap +a host language. Content outside the template syntax lives in specific +AST nodes and can be highlighted by injecting the host grammar. + +Wrapper grammars are configured in the per-grammar config.ini: + + [grammar] + wrapper=text yaml json toml html xml markdown css + +The first token is the AST content node name, the remaining tokens are +host grammar names that this wrapper can wrap. + +This enables two features: + +1. ERROR fallback: when a host grammar produces a catastrophic parse + failure (ERROR root node), each wrapper that lists that host is + tried as an alternative. If the wrapper parses successfully, the + host grammar is injected into the wrapper's content nodes. + +2. Compound extensions: for files like README.md.gotmpl, the inner + extension (.md) identifies the host grammar, which is injected + into the wrapper's content nodes automatically. + + +Parse error highlighting +------------------------ + +When a tree-sitter grammar produces ERROR nodes (parse failures), +the affected regions are highlighted in red. This provides a visual +indication that the parser could not understand parts of the file. 
+ +Valid captures within ERROR regions take precedence over the red +error coloring via the "narrower wins" rule. + + +Syntax highlighting modes +------------------------- + +When compiled with tree-sitter support and Tree-sitter highlighting +is enabled (Options > General > Tree-sitter highlighting), the editor +supports three modes: + +- Tree-sitter (TS): AST-based highlighting using tree-sitter grammars. +- Legacy: Regex-based highlighting using .syntax files. +- None: Syntax highlighting disabled. + +The active mode is shown in the status bar as S:[TS], S:[Legacy], or +S:[None]. When Tree-sitter highlighting is disabled in settings, the +S:[...] indicator is hidden (original look). + +Ctrl+S cycles forward through modes: TS -> Legacy -> None -> TS. +When Tree-sitter highlighting is disabled, Ctrl+S cycles Legacy <-> None +only. + +Ctrl+T toggles directly between TS and Legacy (skips None). This is +useful for quickly comparing tree-sitter and legacy highlighting. +When Tree-sitter highlighting is disabled, Ctrl+T does nothing. + +The --no-tree-sitter command-line flag overrides the persistent setting +and disables tree-sitter for the entire session. + +The Tree-sitter highlighting setting is saved in ~/.config/mc/ini as +editor_use_tree_sitter under the [Midnight-Commander] section. + + +File layout +----------- + +Source files (src/editor/): + + syntax_ts.c Tree-sitter highlighting implementation. 
Contains: + - Grammar registry scanner (scans per-grammar + config.ini files on first use) + - ts_find_grammar() -- matches filename via registry + lookup tables (filenames > shebangs > extensions) + - ts_capture_name_to_color() -- color mapping with + longest-prefix matching + - ts_find_wrapper_for_host() -- finds a wrapper + grammar for a failed host grammar + - ts_setup_wrapper_injection() -- builds injection + query for wrapper grammars + - ts_load_query_file() -- loads .scm file from + per-grammar directory + - ts_init_injections() -- sets up injection parsers + - ts_get_dynamic_lang() -- lazy-loads dynamic grammar + - ts_inject_and_highlight() -- parses and highlights + an injected language, with recursive injection + support (up to 3 levels) + - ts_init_for_file() -- initialization + - ts_free() -- cleanup + - ts_rebuild_highlight_cache() -- query cursor, + injection processing, error highlighting + - ts_get_color_at() -- linear scan + + syntax_ts.h Public API: ts_init_for_file(), ts_free(), + ts_get_color_at(), ts_rebuild_highlight_cache(), + ts_config_reverse_lookup(), + ts_load_grammar_registry(). + + syntax.c Integration points (inside #ifdef HAVE_TREE_SITTER): + - edit_load_syntax() calls ts_init_for_file() + - edit_free_syntax_rules() calls ts_free() + - edit_syntax_ts_notify_edit() -- incremental edit + + ts-grammar-loader.h + Grammar loader using g_module_open() to load .so + modules on demand. Caches loaded modules. Reads + symbol overrides from config.ini via + ts_get_symbol_override(). + + editdraw.c Status bar rendering includes S:[TS]/S:[Legacy]/ + S:[None] when Tree-sitter highlighting is enabled. + + editwidget.h WEdit struct extended with tree-sitter fields. + + editcmd.c Mode cycling (Ctrl+S, Ctrl+T) respects the + use_tree_sitter setting. + + edit.h edit_options_t includes use_tree_sitter (persistent), + syntax_highlight_mode, and ts_available (runtime). 
+ +Installed per-grammar data (from mc-ts-grammars): + + <prefix>/share/mc/syntax-ts/<name>/ + config.ini Grammar metadata and color mappings (INI format). + highlights.scm MC-curated highlight query file. + injections.scm Injection query file (optional). + + <libdir>/mc/ts-grammars/ + <name>.so Shared grammar module. Each exports a single + tree_sitter_<name>() function. + + The loader checks two directories in order: + 1. ~/.local/lib/mc/ts-grammars/ (user-local, checked first) + 2. $(libdir)/mc/ts-grammars/ (system, set at configure time) + +Build system files: + + m4.include/mc-with-tree-sitter.m4 + --with-tree-sitter flag. Checks for libtree-sitter + and gmodule-2.0. Sets TREE_SITTER_GRAMMAR_LIBDIR. + + src/editor/Makefile.am + Links $(GMODULE_LIBS) and defines TS_GRAMMAR_LIBDIR. + + +Per-grammar configuration (config.ini) +-------------------------------------- + +Each installed grammar has a config.ini file in INI format (parseable by +GKeyFile). It contains all metadata MC needs to register and render the +grammar: + + [grammar] + extensions=.py .pyw .pyi + filenames=SConstruct SConscript + shebangs=python python3 + display-name=Python Program + symbol=python + wrapper=text yaml json toml html xml markdown css + + [colors] + comment=brown; + keyword=yellow; + string=green; + variable.builtin=brightred; + +The [grammar] section holds: + + extensions Space-separated file extensions (must start with .). + filenames Space-separated exact basenames. + shebangs Space-separated interpreter names. + display-name Human-readable name for the status bar. + symbol Override for tree_sitter_<name>() function + (only if different from grammar directory name). + wrapper Content node + space-separated host grammar names + (only for wrapper/template grammars). + +The [colors] section maps capture names to MC terminal colors: + + <capture> = <foreground>;<background> + + Background can be omitted (inherits default). Each grammar has its + own self-contained color scheme. + +Lookup precedence when opening a file: +1. 
filenames -- exact basename match (highest priority) +2. shebangs -- interpreter from first line +3. extensions -- file extension match (lowest priority) + +When the grammar registry is first loaded, MC scans per-grammar directories from: +1. ~/.local/share/mc/syntax-ts/*/config.ini (user-local, wins) +2. $(datadir)/mc/syntax-ts/*/config.ini (system) + +User-local grammars fully replace system grammars of the same name. + + +Highlight query files +--------------------- + +Each grammar has a highlight query file named highlights.scm in its +per-grammar directory. Query files are searched in this order: + +1. User: ~/.local/share/mc/syntax-ts/<name>/highlights.scm +2. System: $(datadir)/mc/syntax-ts/<name>/highlights.scm + +All query files are MC-specific, tailored to MC's terminal color scheme. +Upstream query files from grammar repositories are not used directly. + +Query files use tree-sitter query syntax (S-expressions) to match AST +node patterns and assign capture names: + + (function_definition name: (identifier) @function) + (string_literal) @string + (comment) @comment + ["if" "else" "return"] @keyword + +Critical constraint: every node name and anonymous string literal in the +query file MUST exist in the grammar's symbol table. If any name is +invalid, ts_query_new() returns NULL and the language falls back to +legacy highlighting. + +Query files support hierarchical capture names (e.g. @keyword.control, +@string.special). The color lookup performs longest-prefix matching. + + +Color mapping +------------- + +Colors are defined in the [colors] section of each grammar's config.ini. 
+Common capture names and their typical colors: + + Capture Foreground Color Purpose + ------- ---------------- ------- + keyword yellow Language keywords + keyword.other white Secondary keywords + keyword.control brightmagenta Control flow (PHP) + keyword.directive magenta Directives (Make) + function brightcyan Function names + function.special brightred Preprocessor macros + function.builtin brown Built-in functions (Go) + string green String literals + string.special brightgreen Char literals, escapes + comment brown Comments + constant lightgray Constants + variable.builtin brightred self, $vars + operator brightcyan Operators + delimiter brightcyan Brackets, punctuation + tag brightcyan HTML/XML tags + +Available colors: black, red, green, brown, blue, magenta, cyan, +lightgray, gray, brightred, brightgreen, yellow, brightblue, +brightmagenta, brightcyan, white. + + +Adding a new language +--------------------- + +To add tree-sitter highlighting for a new language: + +1. Ensure the grammar is in the mc-ts-grammars repository's + grammars.yaml registry. + +2. Create grammars/<name>/highlights.scm with tree-sitter query + patterns tailored to MC's color scheme. + +3. Create grammars/<name>/config.ini with file matching rules and + colors. + +4. Set release: true in the grammar's grammars.yaml entry. + +5. Submit a pull request to the mc-ts-grammars repository. + +For local testing without a PR: + + # Install the grammar .so and config manually + mkdir -p ~/.local/share/mc/syntax-ts/<name> ~/.local/lib/mc/ts-grammars + cp config.ini highlights.scm ~/.local/share/mc/syntax-ts/<name>/ + cp <name>.so ~/.local/lib/mc/ts-grammars/ + + +Removing or disabling a language +-------------------------------- + +Grammars are managed externally by the mc-ts-grammar installer. + + # Remove a grammar + mc-ts-grammar uninstall <name> + + # Remove all grammars + mc-ts-grammar uninstall --all + +To disable tree-sitter entirely, uncheck "Tree-sitter highlighting" in +Options > General, or pass --no-tree-sitter on the command line. 
+ + +Updating grammars +----------------- + + # Update all installed grammars to the latest release + mc-ts-grammar update --all + + # Update specific grammars + mc-ts-grammar update python bash yaml + + +Validating query files +---------------------- + +A query file must only reference node names that exist in the grammar. +If any node name is invalid, tree-sitter silently rejects the entire +query and the language falls back to legacy highlighting. + +The mc-ts-grammars repository includes a validation step in CI that +compiles all query files against their grammars using ts_query_new(). + +For local validation: + + cd mc-ts-grammars + .github/actions/validate-grammar/scripts/validate.sh --build-dir=build --all + + +Testing with mc-syntax-dump +--------------------------- + +The mc-syntax-dump tool (tests/syntax/) dumps syntax highlighting output +as ANSI-colored text for comparing tree-sitter and legacy rendering. + +Build the tool from the MC source tree: + + cd tests/syntax + make + +Compare tree-sitter and legacy output: + + # TS highlighting using queries from the grammars repo + mc-syntax-dump --ts \ + --grammar-dir /grammars/python \ + --lib-dir /build \ + example.py + + # Legacy highlighting + mc-syntax-dump --legacy example.py + +The --grammar-dir option points to a per-grammar directory containing +highlights.scm and config.ini. The --lib-dir option points to the +directory with built .so files. This allows testing queries directly +from a development checkout without installing them. + +Each grammar in mc-ts-grammars includes a report.md with a comparison +of tree-sitter vs legacy highlighting quality. + + +Troubleshooting +--------------- + +Tree-sitter highlighting does not appear: + + The most common cause is a query compilation failure. If any node + name or anonymous literal in the .scm file does not exist in the + grammar, the entire query is silently rejected and the editor falls + back to legacy highlighting. 
Validate the query file as described + above. + + Other possible causes: + - Tree-sitter highlighting is disabled in Options > General. + - The grammar .so module is not installed in + ~/.local/lib/mc/ts-grammars/ or $(libdir)/mc/ts-grammars/. + - The filename does not match any entry in any grammar's config.ini. + - The grammar symbol name does not match the function exported by + the .so module. + +Colors look wrong: + + Check that the capture names in the query file match entries in the + [colors] section of config.ini. The lookup uses longest-prefix + matching: @keyword.control is matched before @keyword. If no + prefix matches, the default editor color is used. + +A grammar causes high CPU usage or hangs: + + Some grammars may be slow on very large files. If a grammar + consistently causes problems, uninstall it or switch to legacy + highlighting with Ctrl+T. + + +Limitations +----------- + +- Query files are all-or-nothing: if any node name is invalid, the + entire query is rejected. There is no partial fallback within a + single query file. + +- The highlight cache covers the viewport plus 8 KB. Very long lines + or extreme scroll speeds may cause brief flicker as the cache is + rebuilt. + +- Dynamic injection relies on the language name in the source matching + a registered grammar name exactly. Common aliases (e.g. "py" for + "python") are not resolved automatically. + +- During rapid typing (e.g. holding spacebar or backspace), the screen + may not refresh until the key is released. MC's renderer is + single-threaded and tree-sitter work runs synchronously. Injection + processing is skipped during rapid edits and refreshed once input + settles. 
diff --git a/lib/fileloc.h b/lib/fileloc.h index 3e3fb34f64..e39cc37f49 100644 --- a/lib/fileloc.h +++ b/lib/fileloc.h @@ -85,6 +85,7 @@ #define EDIT_HOME_TEMP_FILE EDIT_HOME_DIR PATH_SEP_STR "mcedit.temp" #define EDIT_SYNTAX_DIR "syntax" #define EDIT_SYNTAX_FILE EDIT_SYNTAX_DIR PATH_SEP_STR "Syntax" +#define EDIT_SYNTAX_TS_DIR "syntax-ts" #define EDIT_GLOBAL_MENU "mcedit.menu" #define EDIT_LOCAL_MENU ".cedit.menu" diff --git a/lib/keybind.c b/lib/keybind.c index 194f3ecbc7..58aaa83036 100644 --- a/lib/keybind.c +++ b/lib/keybind.c @@ -330,6 +330,7 @@ static name_keymap_t command_names[] = { ADD_KEYMAP_NAME (InsertLiteral), ADD_KEYMAP_NAME (ShowTabTws), ADD_KEYMAP_NAME (SyntaxOnOff), + ADD_KEYMAP_NAME (SyntaxToggleTS), ADD_KEYMAP_NAME (SyntaxChoose), ADD_KEYMAP_NAME (ShowMargin), ADD_KEYMAP_NAME (OptionsSaveMode), diff --git a/lib/keybind.h b/lib/keybind.h index 28be23559e..db3a567066 100644 --- a/lib/keybind.h +++ b/lib/keybind.h @@ -309,6 +309,7 @@ enum CK_ShowMargin, CK_ShowTabTws, CK_SyntaxOnOff, + CK_SyntaxToggleTS, CK_SyntaxChoose, CK_InsertLiteral, CK_ExternalCommand, diff --git a/m4.include/mc-with-tree-sitter.m4 b/m4.include/mc-with-tree-sitter.m4 new file mode 100644 index 0000000000..bdfc38adc3 --- /dev/null +++ b/m4.include/mc-with-tree-sitter.m4 @@ -0,0 +1,39 @@ +dnl +dnl Tree-sitter syntax highlighting support. 
+dnl + +AC_DEFUN([mc_WITH_TREE_SITTER], [ + + AC_ARG_WITH([tree-sitter], + AS_HELP_STRING([--with-tree-sitter], + [Enable tree-sitter syntax highlighting (in addition to legacy MC highlighting)]), + [with_tree_sitter=$withval], + [with_tree_sitter=no]) + + if test x"$with_tree_sitter" = xyes; then + AC_CHECK_HEADER([tree_sitter/api.h], [], + [AC_MSG_ERROR([tree-sitter headers not found (required for --with-tree-sitter)])]) + AC_CHECK_LIB([tree-sitter], [ts_parser_new], [], + [AC_MSG_ERROR([tree-sitter library not found (required for --with-tree-sitter)])]) + AC_DEFINE([HAVE_TREE_SITTER], [1], [Define if tree-sitter syntax highlighting is enabled]) + + AC_DEFINE([TREE_SITTER_SHARED], [1], [Define if tree-sitter grammars are loaded as shared modules]) + PKG_CHECK_MODULES([GMODULE], [gmodule-2.0], [], + [AC_MSG_ERROR([gmodule-2.0 required for tree-sitter grammar loading])]) + + dnl Resolve libdir for use in Makefile + eval "ts_libdir=\"$libdir\"" + TREE_SITTER_GRAMMAR_LIBDIR="${ts_libdir}/mc/ts-grammars" + + TREE_SITTER_LIBS="-ltree-sitter" + TREE_SITTER_CFLAGS="" + + AC_SUBST([TREE_SITTER_GRAMMAR_LIBDIR]) + AC_SUBST([TREE_SITTER_LIBS]) + AC_SUBST([TREE_SITTER_CFLAGS]) + + AC_MSG_NOTICE([tree-sitter: enabled (shared) -- grammar .so files loaded at runtime]) + else + AC_MSG_NOTICE([tree-sitter syntax highlighting disabled]) + fi +]) diff --git a/misc/Makefile.am b/misc/Makefile.am index b02fcc5b35..9e062526de 100644 --- a/misc/Makefile.am +++ b/misc/Makefile.am @@ -1,6 +1,6 @@ ## Process this file with automake to create Makefile.in. 
-SUBDIRS = ext.d macros.d skins syntax +SUBDIRS = ext.d macros.d skins syntax syntax-ts LIBFILES_OUT = mc.ext.ini diff --git a/misc/mc.default.keymap b/misc/mc.default.keymap index 7e107805e6..41dded2b79 100644 --- a/misc/mc.default.keymap +++ b/misc/mc.default.keymap @@ -369,6 +369,7 @@ MacroStartStopRecord = ctrl-r ShowNumbers = alt-n ShowTabTws = alt-underline SyntaxOnOff = ctrl-s +SyntaxToggleTS = ctrl-t # SyntaxChoose = # ShowMargin = Find = alt-enter diff --git a/misc/mc.emacs.keymap b/misc/mc.emacs.keymap index b03fc8e68c..0658400fbf 100644 --- a/misc/mc.emacs.keymap +++ b/misc/mc.emacs.keymap @@ -368,6 +368,7 @@ MacroStartStopRecord = ctrl-r ShowNumbers = alt-n ShowTabTws = alt-underline SyntaxOnOff = ctrl-s +SyntaxToggleTS = ctrl-t # SyntaxChoose = # ShowMargin = Find = alt-enter diff --git a/misc/syntax-ts/Makefile.am b/misc/syntax-ts/Makefile.am new file mode 100644 index 0000000000..f401ffa10f --- /dev/null +++ b/misc/syntax-ts/Makefile.am @@ -0,0 +1,5 @@ +# Per-grammar config files and query files are installed by mc-ts-grammar. +# This directory is intentionally empty in the source tree. 
+if USE_INTERNAL_EDIT +syntaxtsdir = $(pkgdatadir)/syntax-ts +endif diff --git a/src/Makefile.am b/src/Makefile.am index dbad42ad60..dfd15456ce 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -56,7 +56,8 @@ libinternal_la_LIBADD = \ $(DIFFLIB) $(EDITLIB) $(SUBSHELLLIB) mc_LDADD = \ - libinternal.la + libinternal.la \ + $(TREE_SITTER_LIBS) if ENABLE_MCLIB libinternal_la_LIBADD += \ diff --git a/src/args.c b/src/args.c index a72fab5de6..68c91f26b8 100644 --- a/src/args.c +++ b/src/args.c @@ -57,6 +57,11 @@ gboolean mc_args__force_colors = FALSE; /* Don't load keymap from file and use default one */ gboolean mc_args__nokeymap = FALSE; +#ifdef HAVE_TREE_SITTER +/* Disable tree-sitter highlighting, use legacy syntax engine */ +gboolean mc_args__no_tree_sitter = FALSE; +#endif + char *mc_args__last_wd_file = NULL; /* when enabled NETCODE, use following file as logfile */ @@ -211,6 +216,18 @@ static const GOptionEntry argument_main_table[] = { N_ (" ..."), }, +#ifdef HAVE_TREE_SITTER + { + "no-tree-sitter", + '\0', + G_OPTION_FLAG_IN_MAIN, + G_OPTION_ARG_NONE, + &mc_args__no_tree_sitter, + N_ ("Disable tree-sitter syntax highlighting, use legacy engine"), + NULL, + }, +#endif + G_OPTION_ENTRY_NULL, }; @@ -746,6 +763,11 @@ mc_setup_by_args (int argc, char **argv, GError **mcerror) mc_global.tty.use_subshell = FALSE; #endif +#ifdef HAVE_TREE_SITTER + if (mc_args__no_tree_sitter) + edit_options.syntax_highlight_mode = SYNTAX_HIGHLIGHT_LEGACY; +#endif + #ifdef ENABLE_VFS_FTP if (mc_args__netfs_logfile != NULL) { diff --git a/src/args.h b/src/args.h index db7c2c018e..e3639b9df3 100644 --- a/src/args.h +++ b/src/args.h @@ -16,6 +16,9 @@ extern gboolean mc_args__force_xterm; extern gboolean mc_args__nomouse; extern gboolean mc_args__force_colors; extern gboolean mc_args__nokeymap; +#ifdef HAVE_TREE_SITTER +extern gboolean mc_args__no_tree_sitter; +#endif extern char *mc_args__last_wd_file; extern char *mc_args__netfs_logfile; extern char *mc_args__keymap_file; diff --git 
a/src/editor/Makefile.am b/src/editor/Makefile.am index 304cb35be9..775c1c8fe4 100644 --- a/src/editor/Makefile.am +++ b/src/editor/Makefile.am @@ -1,5 +1,3 @@ -EXTRA_DIST = - if USE_INTERNAL_EDIT noinst_LTLIBRARIES = libedit.la else @@ -21,7 +19,9 @@ libedit_la_SOURCES = \ editwidget.c editwidget.h \ etags.c etags.h \ format.c \ - syntax.c + syntax.c \ + syntax_ts.c syntax_ts.h \ + ts-grammar-loader.h if USE_ASPELL if HAVE_GMODULE @@ -30,4 +30,11 @@ libedit_la_SOURCES += \ endif endif -AM_CPPFLAGS = $(GLIB_CFLAGS) -I$(top_srcdir) +AM_CPPFLAGS = $(GLIB_CFLAGS) $(TREE_SITTER_CFLAGS) -I$(top_srcdir) + +if USE_TREE_SITTER +AM_CPPFLAGS += $(GMODULE_CFLAGS) -DTS_GRAMMAR_LIBDIR=\""$(libdir)/mc/ts-grammars"\" +libedit_la_LIBADD = $(TREE_SITTER_LIBS) $(GMODULE_LIBS) +else +libedit_la_LIBADD = +endif diff --git a/src/editor/edit-impl.h b/src/editor/edit-impl.h index d8662b8738..f264de5a0f 100644 --- a/src/editor/edit-impl.h +++ b/src/editor/edit-impl.h @@ -181,6 +181,7 @@ gboolean edit_load_forward_cmd (WEdit *edit); void edit_block_process_cmd (WEdit *edit, int macro_number); void edit_refresh_cmd (void); void edit_syntax_onoff_cmd (WDialog *h); +void edit_syntax_toggle_ts_cmd (WDialog *h); void edit_show_tabs_tws_cmd (WDialog *h); void edit_show_margin_cmd (WDialog *h); void edit_show_numbers_cmd (WDialog *h); @@ -215,6 +216,11 @@ void edit_free_syntax_rules (WEdit *edit); MC_MOCKABLE int edit_get_syntax_color (WEdit *edit, off_t byte_index); void edit_syntax_dialog (WEdit *edit); +#ifdef HAVE_TREE_SITTER +void edit_syntax_ts_notify_edit (WEdit *edit, off_t start_byte, off_t old_end_byte, + off_t new_end_byte); +#endif + void book_mark_insert (WEdit *edit, long line, int c); gboolean book_mark_query_color (WEdit *edit, long line, int c); struct edit_book_mark_t *book_mark_find (WEdit *edit, long line); diff --git a/src/editor/edit.c b/src/editor/edit.c index 360c347476..fbb8d281b9 100644 --- a/src/editor/edit.c +++ b/src/editor/edit.c @@ -93,6 +93,11 @@ edit_options_t 
edit_options = { .confirm_save = TRUE, .save_position = TRUE, .syntax_highlighting = TRUE, +#ifdef HAVE_TREE_SITTER + .use_tree_sitter = TRUE, + .syntax_highlight_mode = SYNTAX_HIGHLIGHT_TS, + .ts_available = TRUE, +#endif .group_undo = FALSE, .backup_ext = NULL, .filesize_threshold = NULL, @@ -2563,6 +2568,12 @@ edit_insert (WEdit *edit, int c) edit->last_get_rule += (edit->last_get_rule > edit->buffer.curs1) ? 1 : 0; edit_buffer_insert (&edit->buffer, c); + +#ifdef HAVE_TREE_SITTER + // Notify tree-sitter: one byte inserted at cursor position + edit_syntax_ts_notify_edit (edit, edit->buffer.curs1 - 1, edit->buffer.curs1 - 1, + edit->buffer.curs1); +#endif } /* --------------------------------------------------------------------------------------------- */ @@ -2595,6 +2606,12 @@ edit_insert_ahead (WEdit *edit, int c) edit->last_get_rule += (edit->last_get_rule >= edit->buffer.curs1) ? 1 : 0; edit_buffer_insert_ahead (&edit->buffer, c); + +#ifdef HAVE_TREE_SITTER + // Notify tree-sitter: one byte inserted ahead at cursor position + edit_syntax_ts_notify_edit (edit, edit->buffer.curs1, edit->buffer.curs1, + edit->buffer.curs1 + 1); +#endif } /* --------------------------------------------------------------------------------------------- */ @@ -2648,6 +2665,12 @@ edit_delete (WEdit *edit, gboolean byte_delete) p = edit_buffer_delete (&edit->buffer); edit_push_undo_action (edit, p + 256); + +#ifdef HAVE_TREE_SITTER + // Notify tree-sitter: one byte deleted at cursor position + edit_syntax_ts_notify_edit (edit, edit->buffer.curs1, edit->buffer.curs1 + 1, + edit->buffer.curs1); +#endif } edit_modification (edit); @@ -2704,6 +2727,12 @@ edit_backspace (WEdit *edit, gboolean byte_delete) p = edit_buffer_backspace (&edit->buffer); edit_push_undo_action (edit, p); + +#ifdef HAVE_TREE_SITTER + // Notify tree-sitter: one byte deleted before cursor position + edit_syntax_ts_notify_edit (edit, edit->buffer.curs1, edit->buffer.curs1 + 1, + edit->buffer.curs1); +#endif } 
edit_modification (edit); if (p == '\n') diff --git a/src/editor/edit.h b/src/editor/edit.h index 0ff8b8447d..803d182443 100644 --- a/src/editor/edit.h +++ b/src/editor/edit.h @@ -25,6 +25,16 @@ /*** enums ***************************************************************************************/ +#ifdef HAVE_TREE_SITTER +/* Syntax highlighting mode for Ctrl+S cycling */ +typedef enum +{ + SYNTAX_HIGHLIGHT_TS, /* tree-sitter (default when available) */ + SYNTAX_HIGHLIGHT_LEGACY, /* legacy regex-based */ + SYNTAX_HIGHLIGHT_NONE /* highlighting disabled */ +} syntax_highlight_mode_t; +#endif + /*** structures declarations (and typedefs of structures)*****************************************/ /* Editor widget */ @@ -51,6 +61,11 @@ typedef struct gboolean confirm_save; // queries on a save gboolean save_position; gboolean syntax_highlighting; +#ifdef HAVE_TREE_SITTER + gboolean use_tree_sitter; /* persistent: prefer TS highlighting */ + syntax_highlight_mode_t syntax_highlight_mode; + gboolean ts_available; /* runtime: FALSE if TS init failed for current file */ +#endif gboolean group_undo; char *backup_ext; char *filesize_threshold; diff --git a/src/editor/editcmd.c b/src/editor/editcmd.c index a1cc34e877..ce75a0ef71 100644 --- a/src/editor/editcmd.c +++ b/src/editor/editcmd.c @@ -57,6 +57,7 @@ #include "lib/event.h" // mc_event_raise() #include "lib/charsets.h" +#include "src/args.h" // mc_args__no_tree_sitter #include "src/history.h" #include "src/file_history.h" // show_file_history() #include "src/selcodepage.h" @@ -846,11 +847,67 @@ edit_refresh_cmd (void) void edit_syntax_onoff_cmd (WDialog *h) { +#ifdef HAVE_TREE_SITTER + /* Cycle through available modes. 
+ If TS enabled: TS -> Legacy -> None -> TS + If TS disabled: Legacy -> None -> Legacy */ + if (!edit_options.use_tree_sitter || mc_args__no_tree_sitter) + { + /* No TS: just toggle Legacy <-> None */ + edit_options.syntax_highlighting = !edit_options.syntax_highlighting; + edit_options.syntax_highlight_mode = edit_options.syntax_highlighting + ? SYNTAX_HIGHLIGHT_LEGACY : SYNTAX_HIGHLIGHT_NONE; + } + else + { + switch (edit_options.syntax_highlight_mode) + { + case SYNTAX_HIGHLIGHT_TS: + edit_options.syntax_highlight_mode = SYNTAX_HIGHLIGHT_LEGACY; + edit_options.syntax_highlighting = TRUE; + break; + case SYNTAX_HIGHLIGHT_LEGACY: + edit_options.syntax_highlight_mode = SYNTAX_HIGHLIGHT_NONE; + edit_options.syntax_highlighting = FALSE; + break; + case SYNTAX_HIGHLIGHT_NONE: + default: + edit_options.syntax_highlight_mode = SYNTAX_HIGHLIGHT_TS; + edit_options.syntax_highlighting = TRUE; + break; + } + } +#else edit_options.syntax_highlighting = !edit_options.syntax_highlighting; +#endif g_list_foreach (GROUP (h)->widgets, edit_syntax_onoff_cb, NULL); widget_draw (WIDGET (h)); } +/* --------------------------------------------------------------------------------------------- */ + +void +edit_syntax_toggle_ts_cmd (WDialog *h) +{ +#ifdef HAVE_TREE_SITTER + /* Toggle between TS and Legacy (skip None). + Do nothing if TS is disabled. */ + if (!edit_options.use_tree_sitter || mc_args__no_tree_sitter) + return; + + if (edit_options.syntax_highlight_mode == SYNTAX_HIGHLIGHT_TS) + edit_options.syntax_highlight_mode = SYNTAX_HIGHLIGHT_LEGACY; + else + edit_options.syntax_highlight_mode = SYNTAX_HIGHLIGHT_TS; + + edit_options.syntax_highlighting = TRUE; + g_list_foreach (GROUP (h)->widgets, edit_syntax_onoff_cb, NULL); + widget_draw (WIDGET (h)); +#else + (void) h; +#endif +} + /* --------------------------------------------------------------------------------------------- */ /** * Toggle tabs showing in all editor windows. 
diff --git a/src/editor/editdraw.c b/src/editor/editdraw.c index bde7e18047..b661499977 100644 --- a/src/editor/editdraw.c +++ b/src/editor/editdraw.c @@ -148,40 +148,108 @@ static inline void status_string (WEdit *edit, char *s, int w) { char *character_code; +#ifdef HAVE_TREE_SITTER + const char *syntax_mode_label; +#endif character_code = format_character_code (edit); +#ifdef HAVE_TREE_SITTER + if (edit_options.use_tree_sitter) + { + switch (edit_options.syntax_highlight_mode) + { + case SYNTAX_HIGHLIGHT_TS: + syntax_mode_label = edit->ts.active ? "TS" : "Legacy"; + break; + case SYNTAX_HIGHLIGHT_LEGACY: + syntax_mode_label = "Legacy"; + break; + case SYNTAX_HIGHLIGHT_NONE: + default: + syntax_mode_label = "None"; + break; + } + } +#endif + // The field lengths just prevent the status line from shortening too much if (edit_options.simple_statusbar) - g_snprintf (s, w, "%c%c%c%c %3ld %5ld/%ld %6ld/%ld [%s] %s", - edit->mark1 != edit->mark2 ? (edit->column_highlight ? 'C' : 'B') : '-', // - edit->modified != 0 ? 'M' : '-', // - macro_index < 0 ? '-' : 'R', // - edit->overwrite == 0 ? '-' : 'O', // - edit->curs_col + edit->over_col, // - edit->buffer.curs_line + 1, // - edit->buffer.lines + 1, // - (long) edit->buffer.curs1, // - (long) edit->buffer.size, // - character_code, - mc_global.source_codepage >= 0 ? get_codepage_id (mc_global.source_codepage) - : ""); + { +#ifdef HAVE_TREE_SITTER + if (edit_options.use_tree_sitter) + g_snprintf (s, w, + "%c%c%c%c %3ld %5ld/%ld %6ld/%ld [%s] S:[%s] %s", + edit->mark1 != edit->mark2 ? (edit->column_highlight ? 'C' : 'B') : '-', + edit->modified != 0 ? 'M' : '-', + macro_index < 0 ? '-' : 'R', + edit->overwrite == 0 ? '-' : 'O', + edit->curs_col + edit->over_col, + edit->buffer.curs_line + 1, + edit->buffer.lines + 1, + (long) edit->buffer.curs1, + (long) edit->buffer.size, + character_code, + syntax_mode_label, + mc_global.source_codepage >= 0 + ? 
get_codepage_id (mc_global.source_codepage) : ""); + else +#endif + g_snprintf (s, w, + "%c%c%c%c %3ld %5ld/%ld %6ld/%ld [%s] %s", + edit->mark1 != edit->mark2 ? (edit->column_highlight ? 'C' : 'B') : '-', + edit->modified != 0 ? 'M' : '-', + macro_index < 0 ? '-' : 'R', + edit->overwrite == 0 ? '-' : 'O', + edit->curs_col + edit->over_col, + edit->buffer.curs_line + 1, + edit->buffer.lines + 1, + (long) edit->buffer.curs1, + (long) edit->buffer.size, + character_code, + mc_global.source_codepage >= 0 + ? get_codepage_id (mc_global.source_codepage) : ""); + } else - g_snprintf (s, w, "[%c%c%c%c] %2ld L:[%3ld+%2ld %3ld/%3ld] *(%-4ld/%4ldb) [%s] %s", - edit->mark1 != edit->mark2 ? (edit->column_highlight ? 'C' : 'B') : '-', // - edit->modified != 0 ? 'M' : '-', // - macro_index < 0 ? '-' : 'R', // - edit->overwrite == 0 ? '-' : 'O', // - edit->curs_col + edit->over_col, // - edit->start_line + 1, // - edit->curs_row, // - edit->buffer.curs_line + 1, // - edit->buffer.lines + 1, // - (long) edit->buffer.curs1, // - (long) edit->buffer.size, // - character_code, - mc_global.source_codepage >= 0 ? get_codepage_id (mc_global.source_codepage) - : ""); + { +#ifdef HAVE_TREE_SITTER + if (edit_options.use_tree_sitter) + g_snprintf (s, w, + "[%c%c%c%c] %2ld L:[%3ld+%2ld %3ld/%3ld] *(%-4ld/%4ldb) [%s] S:[%s] %s", + edit->mark1 != edit->mark2 ? (edit->column_highlight ? 'C' : 'B') : '-', + edit->modified != 0 ? 'M' : '-', + macro_index < 0 ? '-' : 'R', + edit->overwrite == 0 ? '-' : 'O', + edit->curs_col + edit->over_col, + edit->start_line + 1, + edit->curs_row, + edit->buffer.curs_line + 1, + edit->buffer.lines + 1, + (long) edit->buffer.curs1, + (long) edit->buffer.size, + character_code, + syntax_mode_label, + mc_global.source_codepage >= 0 + ? get_codepage_id (mc_global.source_codepage) : ""); + else +#endif + g_snprintf (s, w, + "[%c%c%c%c] %2ld L:[%3ld+%2ld %3ld/%3ld] *(%-4ld/%4ldb) [%s] %s", + edit->mark1 != edit->mark2 ? (edit->column_highlight ? 
'C' : 'B') : '-', + edit->modified != 0 ? 'M' : '-', + macro_index < 0 ? '-' : 'R', + edit->overwrite == 0 ? '-' : 'O', + edit->curs_col + edit->over_col, + edit->start_line + 1, + edit->curs_row, + edit->buffer.curs_line + 1, + edit->buffer.lines + 1, + (long) edit->buffer.curs1, + (long) edit->buffer.size, + character_code, + mc_global.source_codepage >= 0 + ? get_codepage_id (mc_global.source_codepage) : ""); + } g_free (character_code); } @@ -319,6 +387,34 @@ edit_status_window (WEdit *edit) tty_printf ("[%s]", character_code); g_free (character_code); } + +#ifdef HAVE_TREE_SITTER + /* Show syntax highlighting mode indicator */ + { + const char *mode_label; + + switch (edit_options.syntax_highlight_mode) + { + case SYNTAX_HIGHLIGHT_TS: + mode_label = edit->ts.active ? "TS" : "Legacy"; + break; + case SYNTAX_HIGHLIGHT_LEGACY: + mode_label = "Legacy"; + break; + case SYNTAX_HIGHLIGHT_NONE: + default: + mode_label = "None"; + break; + } + + tty_getyx (&y, &x); + x -= w->rect.x; + if (x + (int) strlen (mode_label) + 5 <= cols - 2) + { + tty_printf (" S:[%s]", mode_label); + } + } +#endif } /* --------------------------------------------------------------------------------------------- */ diff --git a/src/editor/editmenu.c b/src/editor/editmenu.c index 52b876fccf..6aa03f9972 100644 --- a/src/editor/editmenu.c +++ b/src/editor/editmenu.c @@ -154,6 +154,10 @@ create_command_menu (void) g_list_prepend (entries, menu_entry_new (_ ("Go to matching &bracket"), CK_MatchBracket)); entries = g_list_prepend (entries, menu_entry_new (_ ("Toggle s&yntax highlighting"), CK_SyntaxOnOff)); +#ifdef HAVE_TREE_SITTER + entries = g_list_prepend (entries, + menu_entry_new (_ ("Toggle &TS/legacy syntax"), CK_SyntaxToggleTS)); +#endif entries = g_list_prepend (entries, menu_entry_new (_ ("Togg&le right margin"), CK_ShowMargin)); entries = g_list_prepend (entries, menu_separator_new ()); entries = g_list_prepend (entries, menu_entry_new (_ ("&Find declaration"), CK_Find)); diff --git 
a/src/editor/editoptions.c b/src/editor/editoptions.c index f60cb40daa..aad457ffab 100644 --- a/src/editor/editoptions.c +++ b/src/editor/editoptions.c @@ -124,6 +124,9 @@ edit_options_dialog (WDialog *h) char *p, *q; int wrap_mode = 0; gboolean old_syntax_hl; +#ifdef HAVE_TREE_SITTER + gboolean old_use_ts; +#endif #ifdef ENABLE_NLS static gboolean i18n_flag = FALSE; @@ -145,6 +148,11 @@ edit_options_dialog (WDialog *h) else wrap_mode = 0; + old_syntax_hl = edit_options.syntax_highlighting; +#ifdef HAVE_TREE_SITTER + old_use_ts = edit_options.use_tree_sitter; +#endif + { quick_widget_t quick_widgets[] = { // clang-format off @@ -175,6 +183,10 @@ edit_options_dialog (WDialog *h) QUICK_CHECKBOX (_ ("Visible &tabs"), &edit_options.visible_tabs, NULL), QUICK_CHECKBOX (_ ("Synta&x highlighting"), &edit_options.syntax_highlighting, NULL), +#ifdef HAVE_TREE_SITTER + QUICK_CHECKBOX (_ ("Tree-&sitter highlighting"), + &edit_options.use_tree_sitter, NULL), +#endif QUICK_CHECKBOX (_ ("C&ursor after inserted block"), &edit_options.cursor_after_inserted_block, NULL), QUICK_CHECKBOX (_ ("Pers&istent selection"), @@ -206,8 +218,6 @@ edit_options_dialog (WDialog *h) return; } - old_syntax_hl = edit_options.syntax_highlighting; - if (!edit_options.cursor_beyond_eol) g_list_foreach (GROUP (h)->widgets, edit_reset_over_col, NULL); @@ -243,8 +253,12 @@ edit_options_dialog (WDialog *h) edit_options.typewriter_wrap = FALSE; } - // Load or unload syntax rules if the option has changed - if (edit_options.syntax_highlighting != old_syntax_hl) + // Load or unload syntax rules if any highlighting option changed + if (edit_options.syntax_highlighting != old_syntax_hl +#ifdef HAVE_TREE_SITTER + || edit_options.use_tree_sitter != old_use_ts +#endif + ) g_list_foreach (GROUP (h)->widgets, edit_reload_syntax, NULL); } diff --git a/src/editor/editwidget.c b/src/editor/editwidget.c index e905f0f447..6ab17e3e35 100644 --- a/src/editor/editwidget.c +++ b/src/editor/editwidget.c @@ -454,6 +454,9 @@ 
edit_dialog_command_execute (WDialog *h, long command) case CK_SyntaxOnOff: edit_syntax_onoff_cmd (h); break; + case CK_SyntaxToggleTS: + edit_syntax_toggle_ts_cmd (h); + break; case CK_ShowTabTws: edit_show_tabs_tws_cmd (h); break; diff --git a/src/editor/editwidget.h b/src/editor/editwidget.h index c8569cfb35..55093d6b14 100644 --- a/src/editor/editwidget.h +++ b/src/editor/editwidget.h @@ -142,7 +142,23 @@ struct WEdit unsigned int skip_detach_prompt : 1; // Do not prompt whether to detach a file anymore - // syntax highlighting + // syntax highlighting (tree-sitter) + struct + { + void *parser; // TSParser* + void *tree; // TSTree* - current parse tree + void *highlight_query; // TSQuery* - compiled highlight query + GArray *highlights; // array of ts_highlight_entry_t (start, end, color) + off_t highlights_start; // byte range start of cached highlights + off_t highlights_end; // byte range end of cached highlights + char *grammar_name; // grammar name (for per-grammar color lookup) + void *injection_query; // TSQuery* for injections.scm + GHashTable *injection_lang_cache; // lang name -> ts_dynamic_lang_t* + gboolean active; // TRUE if tree-sitter is being used + gboolean need_reparse; // TRUE if tree needs re-parsing before use + } ts; + + // syntax highlighting (legacy fallback) GSList *syntax_marker; GPtrArray *rules; off_t last_get_rule; diff --git a/src/editor/syntax.c b/src/editor/syntax.c index 7a8c8d339f..5def82d5ea 100644 --- a/src/editor/syntax.c +++ b/src/editor/syntax.c @@ -63,9 +63,11 @@ #include "lib/widget.h" // Listbox, message() #include "src/util.h" // file_error_message() +#include "src/args.h" // mc_args__no_tree_sitter #include "edit-impl.h" #include "editwidget.h" +#include "syntax_ts.h" /*** global variables ****************************************************************************/ @@ -149,12 +151,14 @@ typedef struct edit_syntax_rule_t rule; } syntax_marker_t; + /*** forward declarations (file scope functions) 
*************************************************/ /*** file scope variables ************************************************************************/ static char *error_file_name = NULL; + /* --------------------------------------------------------------------------------------------- */ /*** file scope functions ************************************************************************/ /* --------------------------------------------------------------------------------------------- */ @@ -685,7 +689,7 @@ translate_rule_to_color (const WEdit *edit, const edit_syntax_rule_t *rule) In case of an error, *line will not be modified. */ -static size_t +size_t read_one_line (char **line, FILE *f) { GString *p; @@ -832,7 +836,7 @@ get_args (char *l, char **args, int args_size) /* --------------------------------------------------------------------------------------------- */ -static int +int this_try_alloc_color_pair (tty_color_pair_t *color) { char f[80], b[80], a[80], *p; @@ -1416,7 +1420,7 @@ edit_read_syntax_file (WEdit *edit, GPtrArray *pnames, const char *syntax_file, /* --------------------------------------------------------------------------------------------- */ -static const char * +const char * get_first_editor_line (WEdit *edit) { static char s[256]; @@ -1477,6 +1481,7 @@ exec_edit_syntax_dialog (const GPtrArray *names, const char *current_syntax) return listbox_run (syntaxlist); } + /* --------------------------------------------------------------------------------------------- */ /*** public functions ****************************************************************************/ /* --------------------------------------------------------------------------------------------- */ @@ -1487,6 +1492,30 @@ edit_get_syntax_color (WEdit *edit, off_t byte_index) if (!tty_use_colors ()) return 0; +#ifdef HAVE_TREE_SITTER + if (edit_options.syntax_highlighting && edit->ts.active && byte_index < edit->buffer.size) + { + // Check if we need to rebuild the highlight cache + 
// Use a window of +/- 8K around the requested byte + if (edit->ts.highlights_start < 0 || byte_index < edit->ts.highlights_start + || byte_index >= edit->ts.highlights_end) + { + off_t range_start = byte_index - 8192; + off_t range_end = byte_index + 8192; + + if (range_start < 0) + range_start = 0; + if (range_end > edit->buffer.size) + range_end = edit->buffer.size; + + ts_rebuild_highlight_cache (edit, range_start, range_end); + } + + return ts_get_color_at (edit, byte_index); + } +#endif + + // Legacy fallback if (edit_options.syntax_highlighting && edit->rules != NULL && byte_index < edit->buffer.size) { edit_get_rule (edit, byte_index); @@ -1501,14 +1530,37 @@ edit_get_syntax_color (WEdit *edit, off_t byte_index) void edit_free_syntax_rules (WEdit *edit) { +#ifdef HAVE_TREE_SITTER + gboolean had_ts; +#endif + if (edit == NULL) return; +#ifdef HAVE_TREE_SITTER + had_ts = edit->ts.active; + ts_free (edit); +#endif + if (edit->defines != NULL) destroy_defines (&edit->defines); if (edit->rules == NULL) + { + MC_PTR_FREE (edit->syntax_type); +#ifdef HAVE_TREE_SITTER + /* Free temp color pairs even when no legacy rules exist. + TS color pairs are allocated as temporary and must be freed + before reloading to avoid stale pair indices. */ + if (had_ts) + tty_color_free_temp (); +#endif + /* Reset the rule scanner state so the next edit_get_rule() call + rescans from the beginning. Without this, last_get_rule may + match the requested byte_index and return stale rule state. */ + edit->last_get_rule = -1; return; + } edit_get_rule (edit, -1); MC_PTR_FREE (edit->syntax_type); @@ -1532,6 +1584,25 @@ edit_load_syntax (WEdit *edit, GPtrArray *pnames, const char *type) int r; char *f = NULL; +#ifdef HAVE_TREE_SITTER + /* Sync mode with syntax_highlighting boolean (which may have been + loaded from config). If highlighting is off, mode should be NONE. + If highlighting is on and mode is NONE (e.g. restored from config), + reset to TS (or LEGACY if TS is unavailable). 
*/ + if (!edit_options.syntax_highlighting) + { + if (edit_options.syntax_highlight_mode != SYNTAX_HIGHLIGHT_NONE) + edit_options.syntax_highlight_mode = SYNTAX_HIGHLIGHT_NONE; + } + else if (edit_options.syntax_highlight_mode == SYNTAX_HIGHLIGHT_NONE) + { + edit_options.syntax_highlight_mode = + (edit_options.use_tree_sitter && !mc_args__no_tree_sitter) + ? SYNTAX_HIGHLIGHT_TS + : SYNTAX_HIGHLIGHT_LEGACY; + } +#endif + if (auto_syntax) type = NULL; @@ -1553,6 +1624,37 @@ edit_load_syntax (WEdit *edit, GPtrArray *pnames, const char *type) if (edit != NULL && edit->filename_vpath == NULL) return; +#ifdef HAVE_TREE_SITTER + // Try tree-sitter first (only for actual file loading, not name collection). + // Skip if --no-tree-sitter was passed or if user cycled to legacy mode. + if (edit != NULL && pnames == NULL + && !mc_args__no_tree_sitter + && edit_options.syntax_highlight_mode == SYNTAX_HIGHLIGHT_TS) + { + const char *forced_grammar = NULL; + char *grammar_from_type = NULL; + + if (type != NULL) + { + /* Manual syntax selection: reverse-lookup grammar name from display name */ + grammar_from_type = ts_config_reverse_lookup ("display-names", type); + forced_grammar = grammar_from_type; + } + + if (ts_init_for_file (edit, forced_grammar)) + { + edit_options.ts_available = TRUE; + g_free (grammar_from_type); + return; // tree-sitter successfully initialized + } + + g_free (grammar_from_type); + // TS failed for this file - fall through to legacy for this file only. + // Do NOT modify global state: other files may have valid grammars. 
+ } +#endif + + // Fall back to legacy syntax highlighting f = mc_config_get_full_path (EDIT_SYNTAX_FILE); if (edit != NULL) r = edit_read_syntax_file (edit, pnames, f, vfs_path_as_str (edit->filename_vpath), @@ -1562,8 +1664,13 @@ edit_load_syntax (WEdit *edit, GPtrArray *pnames, const char *type) r = edit_read_syntax_file (NULL, pnames, f, NULL, "", NULL); if (r == -1) { +#ifdef HAVE_TREE_SITTER + // When tree-sitter is active, silently skip if legacy Syntax file is not found + edit_free_syntax_rules (edit); +#else edit_free_syntax_rules (edit); file_error_message (_ ("Cannot open file\n%s"), f); +#endif } else if (r != 0) { diff --git a/src/editor/syntax_ts.c b/src/editor/syntax_ts.c new file mode 100644 index 0000000000..d39a9f9e8c --- /dev/null +++ b/src/editor/syntax_ts.c @@ -0,0 +1,2067 @@ +/* + Editor tree-sitter syntax highlighting. + + Copyright (C) 2026 + Free Software Foundation, Inc. + + This file is part of the Midnight Commander. + + The Midnight Commander is free software: you can redistribute it + and/or modify it under the terms of the GNU General Public License as + published by the Free Software Foundation, either version 3 of the License, + or (at your option) any later version. + + The Midnight Commander is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
+ */ + +/** \file + * \brief Source: editor tree-sitter syntax highlighting + */ + +#include + +#include +#include +#include + +#ifdef HAVE_TREE_SITTER + +#include +#include "ts-grammar-loader.h" + +#include "lib/global.h" +#include "lib/skin.h" +#include "lib/fileloc.h" // EDIT_SYNTAX_DIR, EDIT_SYNTAX_TS_DIR +#include "lib/tty/key.h" // is_idle() +#include "lib/strutil.h" // utf string functions +#include "lib/util.h" // whiteness + +#include "edit-impl.h" +#include "editwidget.h" +#include "syntax_ts.h" + +/*** global variables ****************************************************************************/ + +/*** file scope macro definitions ****************************************************************/ + +/*** file scope type declarations ****************************************************************/ + +// tree-sitter highlight cache entry: a range with an associated color +typedef struct +{ + uint32_t start_byte; + uint32_t end_byte; + int color; +} ts_highlight_entry_t; + +// Cached parser+query for a dynamically-injected language +typedef struct +{ + void *parser; // TSParser* + void *query; // TSQuery* -- highlight query + void *injection_query; // TSQuery* -- nested injection query (or NULL if none) + gboolean injection_query_loaded; // TRUE once we tried to load it (success or fail) +} ts_dynamic_lang_t; + + +/*** forward declarations (file scope functions) *************************************************/ + +/*** file scope variables ************************************************************************/ + +/* Per-grammar config loaded from config.ini files */ +typedef struct +{ + char *grammar_name; + char *display_name; + char *symbol_override; + char *wrapper_content_node; /* first token of wrapper= (or NULL) */ + char **wrapper_hosts; /* remaining tokens of wrapper= (NULL-terminated, or NULL) */ + char **extensions; /* from extensions= (NULL-terminated, or NULL) */ + char **filenames; /* from filenames= (NULL-terminated, or NULL) */ + char 
**shebangs; /* from shebangs= (NULL-terminated, or NULL) */ +} ts_grammar_config_t; + +/* Grammar registry: populated on first use by scanning config.ini files */ +static GHashTable *ts_grammar_configs = NULL; /* grammar_name -> ts_grammar_config_t* */ +static GHashTable *ts_ext_map = NULL; /* ".py" -> "python" */ +static GHashTable *ts_filename_map = NULL; /* "Makefile" -> "make" */ +static GHashTable *ts_shebang_map = NULL; /* "python3" -> "python" */ +static GHashTable *ts_display_to_grammar = NULL; /* "Python Program" -> "python" */ +static GHashTable *ts_wrapper_host_map = NULL; /* "yaml" -> "gotmpl" (wrapper name) */ +static GHashTable *ts_wrapper_node_map = NULL; /* "gotmpl" -> "text" (content node) */ +static gboolean ts_registry_loaded = FALSE; + +/* Color mappings loaded from [colors] sections of config.ini files. + Key: "grammar_name:capture_name" (e.g., "python:keyword", "default:comment") + Value: GINT_TO_POINTER(color_pair_id) */ +static GHashTable *ts_color_map = NULL; + +/* Color pair for ERROR nodes (red foreground). Allocated on first use, -1 = not yet allocated. */ +static int ts_error_color = -1; + +/* --------------------------------------------------------------------------------------------- */ +/*** file scope functions ************************************************************************/ +/* --------------------------------------------------------------------------------------------- */ + + +/** + * TSInput read callback: reads chunks of text from the edit buffer. + */ +static const char * +ts_input_read (void *payload, uint32_t byte_index, TSPoint position, uint32_t *bytes_read) +{ + static char buf[4096]; + WEdit *edit = (WEdit *) payload; + uint32_t i; + + (void) position; + + for (i = 0; i < sizeof (buf) && (off_t) (byte_index + i) < edit->buffer.size; i++) + buf[i] = edit_buffer_get_byte (&edit->buffer, (off_t) (byte_index + i)); + + *bytes_read = i; + return (i > 0) ? 
buf : NULL; +} + +/* --------------------------------------------------------------------------------------------- */ + +/** + * Parse a color spec "foreground/background" and allocate an MC color pair. + * A foreground of "-" means use the default color. Returns the color pair ID. + */ +static int +ts_alloc_color_from_spec (const char *spec) +{ + char *buf, *fg, *bg, *sep; + tty_color_pair_t color; + int result; + + buf = g_strdup (spec); + g_strstrip (buf); + + sep = strchr (buf, '/'); + if (sep != NULL) + { + *sep = '\0'; + fg = buf; + bg = sep + 1; + if (*bg == '\0') + bg = NULL; + } + else + { + fg = buf; + bg = NULL; + } + + if (fg != NULL && strcmp (fg, "-") == 0) + fg = NULL; + + if (fg == NULL && bg == NULL) + result = EDITOR_NORMAL_COLOR; + else + { + color.fg = fg; + color.bg = bg; + color.attrs = NULL; + result = this_try_alloc_color_pair (&color); + } + + g_free (buf); + return result; +} + +/** + * Split a space-separated string into a NULL-terminated array. + * Returns newly allocated array (and strings within), or NULL if empty. + */ +static char ** +ts_split_values (const char *str) +{ + char **result; + int src, dst; + + if (str == NULL || *str == '\0') + return NULL; + + result = g_strsplit_set (str, " \t", -1); + + /* Remove empty strings from the array (from consecutive spaces) */ + src = 0; + dst = 0; + while (result[src] != NULL) + { + if (result[src][0] != '\0') + { + result[dst] = result[src]; + dst++; + } + else + g_free (result[src]); + src++; + } + result[dst] = NULL; + + if (dst == 0) + { + g_free (result); + return NULL; + } + + return result; +} + +/* --------------------------------------------------------------------------------------------- */ + +/** + * Load one config.ini file and populate the registry. + * grammar_name is the directory name (canonical grammar name). 
+ */ +static void +ts_load_one_config (const char *config_path, const char *grammar_name) +{ + GKeyFile *kf; + ts_grammar_config_t *cfg; + char *value; + gchar **keys; + gsize k_count, ki; + + kf = g_key_file_new (); + if (!g_key_file_load_from_file (kf, config_path, G_KEY_FILE_NONE, NULL)) + { + g_key_file_free (kf); + return; + } + + cfg = g_new0 (ts_grammar_config_t, 1); + cfg->grammar_name = g_strdup (grammar_name); + + /* [grammar] section */ + value = g_key_file_get_value (kf, "grammar", "display-name", NULL); + if (value != NULL) + { + g_strstrip (value); + cfg->display_name = value; + } + + value = g_key_file_get_value (kf, "grammar", "symbol", NULL); + if (value != NULL) + { + g_strstrip (value); + cfg->symbol_override = value; + } + + value = g_key_file_get_value (kf, "grammar", "extensions", NULL); + if (value != NULL) + { + g_strstrip (value); + cfg->extensions = ts_split_values (value); + g_free (value); + } + + value = g_key_file_get_value (kf, "grammar", "filenames", NULL); + if (value != NULL) + { + g_strstrip (value); + cfg->filenames = ts_split_values (value); + g_free (value); + } + + value = g_key_file_get_value (kf, "grammar", "shebangs", NULL); + if (value != NULL) + { + g_strstrip (value); + cfg->shebangs = ts_split_values (value); + g_free (value); + } + + value = g_key_file_get_value (kf, "grammar", "wrapper", NULL); + if (value != NULL) + { + char **parts; + + g_strstrip (value); + parts = ts_split_values (value); + g_free (value); + + if (parts != NULL && parts[0] != NULL) + { + int count, i; + char **hosts; + + cfg->wrapper_content_node = g_strdup (parts[0]); + + /* Remaining tokens are host grammar names */ + count = 0; + while (parts[count + 1] != NULL) + count++; + if (count > 0) + { + hosts = g_new0 (char *, count + 1); + for (i = 0; i < count; i++) + hosts[i] = g_strdup (parts[i + 1]); + hosts[count] = NULL; + cfg->wrapper_hosts = hosts; + } + } + g_strfreev (parts); + } + + /* Populate lookup maps */ + g_hash_table_insert 
(ts_grammar_configs, g_strdup (grammar_name), cfg); + + if (cfg->display_name != NULL) + g_hash_table_insert (ts_display_to_grammar, g_strdup (cfg->display_name), + g_strdup (grammar_name)); + + if (cfg->extensions != NULL) + { + int i; + + for (i = 0; cfg->extensions[i] != NULL; i++) + g_hash_table_insert (ts_ext_map, g_strdup (cfg->extensions[i]), + g_strdup (grammar_name)); + } + + if (cfg->filenames != NULL) + { + int i; + + for (i = 0; cfg->filenames[i] != NULL; i++) + g_hash_table_insert (ts_filename_map, g_strdup (cfg->filenames[i]), + g_strdup (grammar_name)); + } + + if (cfg->shebangs != NULL) + { + int i; + + for (i = 0; cfg->shebangs[i] != NULL; i++) + g_hash_table_insert (ts_shebang_map, g_strdup (cfg->shebangs[i]), + g_strdup (grammar_name)); + } + + if (cfg->wrapper_hosts != NULL) + { + int i; + + g_hash_table_insert (ts_wrapper_node_map, g_strdup (grammar_name), + g_strdup (cfg->wrapper_content_node)); + for (i = 0; cfg->wrapper_hosts[i] != NULL; i++) + g_hash_table_insert (ts_wrapper_host_map, g_strdup (cfg->wrapper_hosts[i]), + g_strdup (grammar_name)); + } + + /* [colors] section -> populate ts_color_map */ + keys = g_key_file_get_keys (kf, "colors", &k_count, NULL); + if (keys != NULL) + { + for (ki = 0; ki < k_count; ki++) + { + char *color_value; + char *map_key; + int color_pair; + + color_value = g_key_file_get_value (kf, "colors", keys[ki], NULL); + if (color_value == NULL) + continue; + + color_pair = ts_alloc_color_from_spec (color_value); + g_free (color_value); + + map_key = g_strdup_printf ("%s:%s", grammar_name, keys[ki]); + g_hash_table_insert (ts_color_map, map_key, GINT_TO_POINTER (color_pair)); + } + g_strfreev (keys); + } + + g_key_file_free (kf); +} + +/* --------------------------------------------------------------------------------------------- */ + +/** + * Scan per-grammar config.ini files from both search paths. + * User-local entries take precedence over system entries. 
+ */ +void +ts_load_grammar_registry (void) +{ + const char *dirs[2]; + int d; + + if (ts_registry_loaded) + return; + + ts_registry_loaded = TRUE; + + /* Allocate red color for ERROR nodes (parse failures) */ + ts_error_color = ts_alloc_color_from_spec ("red;"); + + ts_grammar_configs = g_hash_table_new (g_str_hash, g_str_equal); + ts_ext_map = g_hash_table_new_full (g_str_hash, g_str_equal, g_free, g_free); + ts_filename_map = g_hash_table_new_full (g_str_hash, g_str_equal, g_free, g_free); + ts_shebang_map = g_hash_table_new_full (g_str_hash, g_str_equal, g_free, g_free); + ts_display_to_grammar = g_hash_table_new_full (g_str_hash, g_str_equal, g_free, g_free); + ts_wrapper_host_map = g_hash_table_new_full (g_str_hash, g_str_equal, g_free, g_free); + ts_wrapper_node_map = g_hash_table_new_full (g_str_hash, g_str_equal, g_free, g_free); + ts_color_map = g_hash_table_new_full (g_str_hash, g_str_equal, g_free, NULL); + + dirs[0] = mc_config_get_data_path (); + dirs[1] = mc_global.share_data_dir; + + /* Scan system first, then user-local overwrites */ + for (d = 1; d >= 0; d--) + { + char *ts_dir; + GDir *dir; + const char *entry; + + ts_dir = g_build_filename (dirs[d], EDIT_SYNTAX_TS_DIR, (char *) NULL); + dir = g_dir_open (ts_dir, 0, NULL); + if (dir == NULL) + { + g_free (ts_dir); + continue; + } + + while ((entry = g_dir_read_name (dir)) != NULL) + { + char *config_path; + + config_path = g_build_filename (ts_dir, entry, "config.ini", (char *) NULL); + if (g_file_test (config_path, G_FILE_TEST_IS_REGULAR)) + ts_load_one_config (config_path, entry); + g_free (config_path); + } + + g_dir_close (dir); + g_free (ts_dir); + } +} + +/* --------------------------------------------------------------------------------------------- */ + +/** + * Get the config record for a grammar. Triggers registry load on first use. 
+ */ +static const ts_grammar_config_t * +ts_get_grammar_config (const char *grammar_name) +{ + if (!ts_registry_loaded) + ts_load_grammar_registry (); + + return (const ts_grammar_config_t *) g_hash_table_lookup (ts_grammar_configs, grammar_name); +} + +/* --------------------------------------------------------------------------------------------- */ + +/** + * Get the symbol override for a grammar (for .so loading). + * Returns newly allocated string or NULL. + */ +char * +ts_get_symbol_override (const char *grammar_name) +{ + const ts_grammar_config_t *cfg = ts_get_grammar_config (grammar_name); + + if (cfg != NULL && cfg->symbol_override != NULL) + return g_strdup (cfg->symbol_override); + return NULL; +} + +/* --------------------------------------------------------------------------------------------- */ + +/** + * Look up the color for a tree-sitter capture name. + * Uses longest-prefix matching: tries the full name first, then strips + * the last ".suffix" and retries until a match is found or exhausted. + * + * Lookup order for each prefix: + * 1. [grammar_name]:capture (per-grammar color) + * 2. 
[default]:capture (global default, if a default/ config exists) + */ +static int +ts_capture_name_to_color (const char *capture_name, const char *grammar_name) +{ + char *name; + char *dot; + + if (ts_color_map == NULL) + return EDITOR_NORMAL_COLOR; + + name = g_strdup (capture_name); + + for (;;) + { + gpointer value; + char *key; + + /* Try grammar-specific color first */ + if (grammar_name != NULL) + { + key = g_strdup_printf ("%s:%s", grammar_name, name); + if (g_hash_table_lookup_extended (ts_color_map, key, NULL, &value)) + { + g_free (key); + g_free (name); + return GPOINTER_TO_INT (value); + } + g_free (key); + } + + /* Try default section */ + key = g_strdup_printf ("default:%s", name); + if (g_hash_table_lookup_extended (ts_color_map, key, NULL, &value)) + { + g_free (key); + g_free (name); + return GPOINTER_TO_INT (value); + } + g_free (key); + + /* Strip the last .suffix and retry */ + dot = strrchr (name, '.'); + if (dot == NULL) + break; + *dot = '\0'; + } + + g_free (name); + return EDITOR_NORMAL_COLOR; +} + +/* --------------------------------------------------------------------------------------------- */ + +/** + * Reverse lookup: given a display name, return the grammar name. + * Returns newly allocated string or NULL. + */ +char * +ts_config_reverse_lookup (const char *config_name, const char *display_value) +{ + const char *grammar; + + (void) config_name; /* kept for API compatibility */ + + if (!ts_registry_loaded) + ts_load_grammar_registry (); + + grammar = (const char *) g_hash_table_lookup (ts_display_to_grammar, display_value); + return (grammar != NULL) ? g_strdup (grammar) : NULL; +} + +/* --------------------------------------------------------------------------------------------- */ + +/** + * Extract interpreter name from a shebang line. + * "#!/usr/bin/env python3" -> "python3" + * "#!/usr/bin/python" -> "python" + * Returns newly allocated string or NULL. 
+ */ +static char * +ts_extract_interpreter (const char *first_line) +{ + const char *p; + const char *word_start; + const char *word_end; + const char *basename_start; + + if (first_line == NULL || first_line[0] != '#' || first_line[1] != '!') + return NULL; + + p = first_line + 2; + + /* Skip whitespace after #! */ + while (*p != '\0' && whiteness (*p)) + p++; + + if (*p == '\0') + return NULL; + + /* Extract the first word (path to interpreter) */ + word_start = p; + while (*p != '\0' && !whiteness (*p)) + p++; + word_end = p; + + /* Get basename of the path */ + basename_start = word_end - 1; + while (basename_start > word_start && *(basename_start - 1) != PATH_SEP) + basename_start--; + + /* If the basename is "env", skip to the next word */ + if ((word_end - basename_start) == 3 && strncmp (basename_start, "env", 3) == 0) + { + /* Skip whitespace */ + while (*p != '\0' && whiteness (*p)) + p++; + + if (*p == '\0') + return NULL; + + /* Skip any flags (words starting with -) */ + while (*p == '-') + { + while (*p != '\0' && !whiteness (*p)) + p++; + while (*p != '\0' && whiteness (*p)) + p++; + } + + if (*p == '\0') + return NULL; + + word_start = p; + while (*p != '\0' && !whiteness (*p)) + p++; + word_end = p; + + /* Get basename again */ + basename_start = word_end - 1; + while (basename_start > word_start && *(basename_start - 1) != PATH_SEP) + basename_start--; + } + + if (basename_start >= word_end) + return NULL; + + return g_strndup (basename_start, (gsize) (word_end - basename_start)); +} + +/* --------------------------------------------------------------------------------------------- */ + +/** + * Find a matching grammar for the given filename and first line. + * Precedence: exact filename > shebang > extension. + * On match, fills grammar_name and display_name (newly allocated strings). + * Returns TRUE if a match was found. 
+ */ +static gboolean +ts_find_grammar (const char *filename, const char *first_line, + char **grammar_name, char **display_name) +{ + const char *basename; + const char *match; + const ts_grammar_config_t *cfg; + + if (filename == NULL) + return FALSE; + + if (!ts_registry_loaded) + ts_load_grammar_registry (); + + *grammar_name = NULL; + *display_name = NULL; + + basename = strrchr (filename, PATH_SEP); + basename = (basename != NULL) ? basename + 1 : filename; + + /* 1. Exact filename match (most specific) */ + match = (const char *) g_hash_table_lookup (ts_filename_map, basename); + if (match != NULL) + { + *grammar_name = g_strdup (match); + cfg = ts_get_grammar_config (match); + *display_name = (cfg != NULL && cfg->display_name != NULL) + ? g_strdup (cfg->display_name) : g_strdup (match); + return TRUE; + } + + /* 2. Shebang match (content-based) */ + if (first_line != NULL && first_line[0] == '#' && first_line[1] == '!') + { + char *interpreter = ts_extract_interpreter (first_line); + + if (interpreter != NULL) + { + match = (const char *) g_hash_table_lookup (ts_shebang_map, interpreter); + g_free (interpreter); + if (match != NULL) + { + *grammar_name = g_strdup (match); + cfg = ts_get_grammar_config (match); + *display_name = (cfg != NULL && cfg->display_name != NULL) + ? g_strdup (cfg->display_name) : g_strdup (match); + return TRUE; + } + } + } + + /* 3. Extension match (least specific) */ + { + const char *dot = strrchr (basename, '.'); + + if (dot != NULL) + { + match = (const char *) g_hash_table_lookup (ts_ext_map, dot); + if (match != NULL) + { + *grammar_name = g_strdup (match); + cfg = ts_get_grammar_config (match); + *display_name = (cfg != NULL && cfg->display_name != NULL) + ? 
g_strdup (cfg->display_name) : g_strdup (match); + return TRUE; + } + } + } + + return FALSE; +} + +/* --------------------------------------------------------------------------------------------- */ + +/** + * Look up the wrappers registry to find a wrapper grammar for a given host. + * Returns the wrapper grammar name (newly allocated) and fills content_node. + * Returns NULL if no wrapper handles this host. + */ +static char * +ts_find_wrapper_for_host (const char *host_grammar, char **content_node) +{ + const char *wrapper; + const char *node; + + *content_node = NULL; + + if (!ts_registry_loaded) + ts_load_grammar_registry (); + + wrapper = (const char *) g_hash_table_lookup (ts_wrapper_host_map, host_grammar); + if (wrapper == NULL) + return NULL; + + node = (const char *) g_hash_table_lookup (ts_wrapper_node_map, wrapper); + if (node == NULL) + return NULL; + + *content_node = g_strdup (node); + return g_strdup (wrapper); +} + +/* --------------------------------------------------------------------------------------------- */ + +/** + * Look up the wrappers registry to find the content node for a wrapper grammar. + * Returns the content node name (newly allocated), or NULL if not a wrapper. + */ +static char * +ts_find_wrapper_content_node (const char *wrapper_grammar) +{ + const char *node; + + if (!ts_registry_loaded) + ts_load_grammar_registry (); + + node = (const char *) g_hash_table_lookup (ts_wrapper_node_map, wrapper_grammar); + return (node != NULL) ? g_strdup (node) : NULL; +} + +/* --------------------------------------------------------------------------------------------- */ + +/** + * Build and install an injection query that injects host_grammar into + * content_node nodes of the current wrapper grammar. + * Example: inject "yaml" into "text" nodes produces: + * ((text) @injection.content (#set! 
injection.language "yaml")) + */ +static void +ts_setup_wrapper_injection (WEdit *edit, const TSLanguage *lang, + const char *content_node, const char *host_grammar) +{ + char *inj_src; + uint32_t inj_len; + uint32_t eo; + TSQueryError et; + TSQuery *inj_query; + + inj_src = g_strdup_printf ("((%s) @injection.content (#set! injection.language \"%s\"))", + content_node, host_grammar); + inj_len = (uint32_t) strlen (inj_src); + inj_query = ts_query_new (lang, inj_src, inj_len, &eo, &et); + g_free (inj_src); + + if (inj_query != NULL) + { + edit->ts.injection_query = inj_query; + edit->ts.injection_lang_cache = g_hash_table_new (g_str_hash, g_str_equal); + } +} + +/* --------------------------------------------------------------------------------------------- */ + +/** + * Load a query file from per-grammar directory. Search order: + * 1. User: ~/.local/share/mc/syntax-ts// + * 2. System: $(datadir)/mc/syntax-ts// + * Returns a newly allocated string with the file contents, or NULL on failure. + */ +static char * +ts_load_query_file (const char *grammar_name, const char *query_filename, uint32_t *out_len) +{ + const char *base_dirs[2]; + char *contents = NULL; + gsize len = 0; + int b; + + base_dirs[0] = mc_config_get_data_path (); + base_dirs[1] = mc_global.share_data_dir; + + for (b = 0; b < 2; b++) + { + char *path; + + path = g_build_filename (base_dirs[b], EDIT_SYNTAX_TS_DIR, grammar_name, + query_filename, (char *) NULL); + if (g_file_get_contents (path, &contents, &len, NULL)) + { + g_free (path); + *out_len = (uint32_t) len; + return contents; + } + g_free (path); + } + + *out_len = 0; + return NULL; +} + +/* --------------------------------------------------------------------------------------------- */ + +/** + * Extract the value of a #set! predicate for a given key from a query pattern. + * Returns the value string (owned by the query, do not free), or NULL if not found. 
+ */ +static const char * +ts_get_set_predicate (TSQuery *query, uint32_t pattern_index, const char *key) +{ + uint32_t step_count; + const TSQueryPredicateStep *steps; + uint32_t i; + + steps = ts_query_predicates_for_pattern (query, pattern_index, &step_count); + + for (i = 0; i + 3 < step_count; i++) + { + /* #set! pattern: String("set!") String(key) String(value) Done */ + if (steps[i].type == TSQueryPredicateStepTypeString + && steps[i + 1].type == TSQueryPredicateStepTypeString + && steps[i + 2].type == TSQueryPredicateStepTypeString + && steps[i + 3].type == TSQueryPredicateStepTypeDone) + { + uint32_t len; + const char *name; + + name = ts_query_string_value_for_id (query, steps[i].value_id, &len); + if (strcmp (name, "set!") == 0) + { + const char *prop; + + prop = ts_query_string_value_for_id (query, steps[i + 1].value_id, &len); + if (strcmp (prop, key) == 0) + return ts_query_string_value_for_id (query, steps[i + 2].value_id, &len); + } + } + } + + return NULL; +} + +/* --------------------------------------------------------------------------------------------- */ + +/** + * Read capture text from the edit buffer for a given node. + * Returns a newly allocated string, or NULL if the node is too large. + */ +static char * +ts_read_node_text (WEdit *edit, TSNode node) +{ + uint32_t start, end, len, i; + char *buf; + + start = ts_node_start_byte (node); + end = ts_node_end_byte (node); + len = end - start; + + if (len == 0 || len > 256) + return NULL; + + buf = g_new (char, len + 1); + for (i = 0; i < len; i++) + buf[i] = (char) edit_buffer_get_byte (&edit->buffer, (off_t) (start + i)); + buf[len] = '\0'; + + return buf; +} + +/* --------------------------------------------------------------------------------------------- */ + +/** + * Evaluate filter predicates (#eq?, #any-of?, #not-any-of?, #match?) for a match. + * Returns TRUE if the match passes all predicates (should be used). + * Returns FALSE if any predicate rejects the match (should be skipped). 
+ * + * Predicate format in TSQueryPredicateStep arrays: + * #eq? @capture "value" Done + * #any-of? @capture "v1" "v2" ... Done + * #not-any-of? @capture "v1" "v2" ... Done + * #match? @capture "regex" Done + */ +static gboolean +ts_evaluate_match_predicates (TSQuery *query, const TSQueryMatch *match, WEdit *edit) +{ + uint32_t step_count; + const TSQueryPredicateStep *steps; + uint32_t i; + + steps = ts_query_predicates_for_pattern (query, match->pattern_index, &step_count); + if (step_count == 0) + return TRUE; + + for (i = 0; i < step_count;) + { + uint32_t len; + const char *pred_name; + + /* Each predicate starts with a String (the predicate name) */ + if (steps[i].type != TSQueryPredicateStepTypeString) + { + /* Skip to next Done */ + while (i < step_count && steps[i].type != TSQueryPredicateStepTypeDone) + i++; + if (i < step_count) + i++; /* skip Done */ + continue; + } + + pred_name = ts_query_string_value_for_id (query, steps[i].value_id, &len); + + if (strcmp (pred_name, "eq?") == 0 && i + 3 <= step_count) + { + /* #eq? @capture "value" Done */ + if (steps[i + 1].type == TSQueryPredicateStepTypeCapture + && steps[i + 2].type == TSQueryPredicateStepTypeString) + { + uint32_t cap_idx = steps[i + 1].value_id; + const char *expected; + uint32_t ci; + gboolean found = FALSE; + + expected = ts_query_string_value_for_id (query, steps[i + 2].value_id, &len); + + for (ci = 0; ci < match->capture_count; ci++) + { + if (match->captures[ci].index == cap_idx) + { + char *text = ts_read_node_text (edit, match->captures[ci].node); + if (text != NULL) + { + found = (strcmp (text, expected) == 0); + g_free (text); + } + break; + } + } + + if (!found) + return FALSE; + } + } + else if (strcmp (pred_name, "any-of?") == 0 && i + 2 <= step_count) + { + /* #any-of? @capture "v1" "v2" ... 
Done */ + if (steps[i + 1].type == TSQueryPredicateStepTypeCapture) + { + uint32_t cap_idx = steps[i + 1].value_id; + char *text = NULL; + gboolean matched = FALSE; + uint32_t ci, j; + + /* Find capture text */ + for (ci = 0; ci < match->capture_count; ci++) + { + if (match->captures[ci].index == cap_idx) + { + text = ts_read_node_text (edit, match->captures[ci].node); + break; + } + } + + if (text != NULL) + { + /* Check against each value string */ + for (j = i + 2; j < step_count && steps[j].type == TSQueryPredicateStepTypeString; + j++) + { + const char *val; + val = ts_query_string_value_for_id (query, steps[j].value_id, &len); + if (strcmp (text, val) == 0) + { + matched = TRUE; + break; + } + } + g_free (text); + } + + if (!matched) + return FALSE; + } + } + else if (strcmp (pred_name, "not-any-of?") == 0 && i + 2 <= step_count) + { + /* #not-any-of? @capture "v1" "v2" ... Done */ + if (steps[i + 1].type == TSQueryPredicateStepTypeCapture) + { + uint32_t cap_idx = steps[i + 1].value_id; + char *text = NULL; + gboolean rejected = FALSE; + uint32_t ci, j; + + for (ci = 0; ci < match->capture_count; ci++) + { + if (match->captures[ci].index == cap_idx) + { + text = ts_read_node_text (edit, match->captures[ci].node); + break; + } + } + + if (text != NULL) + { + for (j = i + 2; j < step_count && steps[j].type == TSQueryPredicateStepTypeString; + j++) + { + const char *val; + val = ts_query_string_value_for_id (query, steps[j].value_id, &len); + if (strcmp (text, val) == 0) + { + rejected = TRUE; + break; + } + } + g_free (text); + } + + if (rejected) + return FALSE; + } + } + else if (strcmp (pred_name, "match?") == 0 && i + 3 <= step_count) + { + /* #match? 
@capture "regex" Done */ + if (steps[i + 1].type == TSQueryPredicateStepTypeCapture + && steps[i + 2].type == TSQueryPredicateStepTypeString) + { + uint32_t cap_idx = steps[i + 1].value_id; + const char *pattern; + gboolean matched = FALSE; + uint32_t ci; + + pattern = ts_query_string_value_for_id (query, steps[i + 2].value_id, &len); + + for (ci = 0; ci < match->capture_count; ci++) + { + if (match->captures[ci].index == cap_idx) + { + char *text = ts_read_node_text (edit, match->captures[ci].node); + if (text != NULL) + { + matched = g_regex_match_simple (pattern, text, 0, 0); + g_free (text); + } + break; + } + } + + if (!matched) + return FALSE; + } + } + /* Skip #set! and other predicates we don't filter on */ + + /* Advance to next Done */ + while (i < step_count && steps[i].type != TSQueryPredicateStepTypeDone) + i++; + if (i < step_count) + i++; /* skip Done */ + } + + return TRUE; +} + +/* --------------------------------------------------------------------------------------------- */ + +/** + * Initialize injection query for a grammar by loading -injections.scm. + * Called after the primary grammar is initialized. + * Failure is non-fatal — highlighting works without injections. 
+ */ +static void +ts_init_injections (WEdit *edit, const char *grammar_name, const TSLanguage *lang) +{ + char *query_src; + uint32_t query_len; + TSQuery *inj_query; + uint32_t error_offset; + TSQueryError error_type; + + query_src = ts_load_query_file (grammar_name, "injections.scm", &query_len); + + if (query_src == NULL) + return; /* no injections for this grammar */ + + inj_query = ts_query_new (lang, query_src, query_len, &error_offset, &error_type); + g_free (query_src); + + if (inj_query == NULL) + return; /* query compilation failed */ + + /* Store the injection query and a cache for dynamic languages */ + edit->ts.injection_query = inj_query; + edit->ts.injection_lang_cache = g_hash_table_new (g_str_hash, g_str_equal); +} + +/* --------------------------------------------------------------------------------------------- */ + +/** + * Try to initialize tree-sitter for the given edit widget. + * Returns TRUE on success, FALSE if we should fall back to legacy highlighting. + * + * Grammar discovery: + * 1. Config file ts-grammars maps filename regex -> grammar_name + * 2. Grammar looked up in static registry (compiled-in) + * 3. Query file by convention: -highlights.scm + */ +gboolean +ts_init_for_file (WEdit *edit, const char *forced_grammar) +{ + const char *filename; + char *grammar_name = NULL; + char *display_name = NULL; + const TSLanguage *lang; + TSParser *parser; + TSTree *tree; + TSInput input; + char *query_src; + uint32_t query_len; + uint32_t error_offset; + TSQueryError error_type; + TSQuery *query; + + /* Ensure grammar registry (and colors) are loaded */ + if (!ts_registry_loaded) + ts_load_grammar_registry (); + + filename = vfs_path_as_str (edit->filename_vpath); + + if (forced_grammar != NULL) + { + /* Manual grammar selection — skip auto-detection */ + grammar_name = g_strdup (forced_grammar); + { + const ts_grammar_config_t *cfg = ts_get_grammar_config (forced_grammar); + display_name = (cfg != NULL && cfg->display_name != NULL) + ? 
g_strdup (cfg->display_name) : g_strdup (forced_grammar); + } + } + else if (!ts_find_grammar (filename, get_first_editor_line (edit), &grammar_name, + &display_name)) + return FALSE; + + // Look up grammar in the static registry + lang = ts_grammar_registry_lookup (grammar_name); + if (lang == NULL) + { + g_free (grammar_name); + g_free (display_name); + return FALSE; + } + + // Create parser and set language + parser = ts_parser_new (); + if (!ts_parser_set_language (parser, lang)) + { + ts_parser_delete (parser); + g_free (grammar_name); + g_free (display_name); + return FALSE; + } + + // Parse the buffer + input.payload = edit; + input.read = ts_input_read; + input.encoding = TSInputEncodingUTF8; + + tree = ts_parser_parse (parser, NULL, input); + if (tree == NULL) + { + ts_parser_delete (parser); + g_free (grammar_name); + g_free (display_name); + return FALSE; + } + + /* If the parser produced an ERROR root (catastrophic parse failure), check + the wrappers config to see if a wrapper grammar can handle this file. + For example, a .yaml file with Go template syntax fails to parse as YAML, + so a wrapper like gotmpl is tried. If it succeeds, the original grammar + is injected into the wrapper's content nodes (e.g. "text"). 
*/ + if (ts_node_is_error (ts_tree_root_node (tree))) + { + char *content_node = NULL; + char *wrapper_name; + + wrapper_name = ts_find_wrapper_for_host (grammar_name, &content_node); + if (wrapper_name != NULL) + { + const TSLanguage *wrapper_lang; + + wrapper_lang = ts_grammar_registry_lookup (wrapper_name); + if (wrapper_lang != NULL && ts_parser_set_language (parser, wrapper_lang)) + { + TSTree *wrapper_tree; + + wrapper_tree = ts_parser_parse (parser, NULL, input); + if (wrapper_tree != NULL + && !ts_node_is_error (ts_tree_root_node (wrapper_tree))) + { + /* Inject the original grammar into the wrapper's content nodes */ + ts_setup_wrapper_injection (edit, wrapper_lang, content_node, + grammar_name); + + ts_tree_delete (tree); + tree = wrapper_tree; + lang = wrapper_lang; + g_free (grammar_name); + grammar_name = g_strdup (wrapper_name); + g_free (display_name); + { + const ts_grammar_config_t *wcfg = ts_get_grammar_config (wrapper_name); + display_name = (wcfg != NULL && wcfg->display_name != NULL) + ? 
g_strdup (wcfg->display_name) : g_strdup (wrapper_name); + } + } + else + { + /* Wrapper didn't help — restore original language and tree */ + if (wrapper_tree != NULL) + ts_tree_delete (wrapper_tree); + ts_parser_set_language (parser, lang); + } + } + + g_free (wrapper_name); + } + + g_free (content_node); + } + + // Load and compile highlight query + query_src = ts_load_query_file (grammar_name, "highlights.scm", &query_len); + + if (query_src == NULL) + { + ts_tree_delete (tree); + ts_parser_delete (parser); + g_free (grammar_name); + g_free (display_name); + return FALSE; + } + + query = ts_query_new (lang, query_src, query_len, &error_offset, &error_type); + g_free (query_src); + + if (query == NULL) + { + ts_tree_delete (tree); + ts_parser_delete (parser); + g_free (grammar_name); + g_free (display_name); + return FALSE; + } + + // All good -- store in edit widget + edit->ts.parser = parser; + edit->ts.tree = tree; + edit->ts.highlight_query = query; + edit->ts.highlights = g_array_new (FALSE, FALSE, sizeof (ts_highlight_entry_t)); + edit->ts.highlights_start = -1; + edit->ts.highlights_end = -1; + edit->ts.grammar_name = g_strdup (grammar_name); + edit->ts.active = TRUE; + edit->ts.need_reparse = FALSE; + + // Try to initialize language injection (e.g., markdown inline within markdown block) + // Failure is non-fatal — highlighting works without injection. + // Skip if injection was already set up (e.g., by gotmpl fallback). + if (edit->ts.injection_query == NULL) + ts_init_injections (edit, grammar_name, lang); + + /* For wrapper grammars with a compound extension (e.g., .md.gotmpl, .yaml.tmpl), + inject the host language into the wrapper's content nodes based on the + preceding extension. The wrappers config defines which grammars are wrappers + and what content node they use. 
*/ + if (edit->ts.injection_query == NULL) + { + char *content_node; + + content_node = ts_find_wrapper_content_node (grammar_name); + if (content_node != NULL) + { + const char *base; + const char *last_dot; + + base = strrchr (filename, PATH_SEP); + base = (base != NULL) ? base + 1 : filename; + last_dot = strrchr (base, '.'); + + /* Look for a second extension before the wrapper extension */ + if (last_dot != NULL && last_dot > base) + { + const char *prev_dot; + char ext_buf[32]; + ptrdiff_t ext_len; + + prev_dot = g_strrstr_len (base, (gssize) (last_dot - base), "."); + if (prev_dot != NULL) + { + ext_len = last_dot - prev_dot; + if (ext_len > 0 && ext_len < (ptrdiff_t) sizeof (ext_buf)) + { + char *host_grammar; + + memcpy (ext_buf, prev_dot, (size_t) ext_len); + ext_buf[ext_len] = '\0'; + + host_grammar = g_hash_table_lookup (ts_ext_map, ext_buf) != NULL + ? g_strdup ((const char *) g_hash_table_lookup (ts_ext_map, ext_buf)) + : NULL; + if (host_grammar != NULL) + { + ts_setup_wrapper_injection (edit, lang, content_node, + host_grammar); + g_free (host_grammar); + } + } + } + } + + g_free (content_node); + } + } + + g_free (edit->syntax_type); + edit->syntax_type = display_name; // takes ownership + + g_free (grammar_name); + + return TRUE; +} + +/* --------------------------------------------------------------------------------------------- */ + +/** + * Free all tree-sitter resources associated with the edit widget. 
+ */ +void +ts_free (WEdit *edit) +{ + // Free injection resources + if (edit->ts.injection_query != NULL) + { + ts_query_delete ((TSQuery *) edit->ts.injection_query); + edit->ts.injection_query = NULL; + } + + if (edit->ts.injection_lang_cache != NULL) + { + GHashTableIter iter; + gpointer key, value; + + g_hash_table_iter_init (&iter, edit->ts.injection_lang_cache); + while (g_hash_table_iter_next (&iter, &key, &value)) + { + ts_dynamic_lang_t *dl = (ts_dynamic_lang_t *) value; + + if (dl->query != NULL) + ts_query_delete ((TSQuery *) dl->query); + if (dl->injection_query != NULL) + ts_query_delete ((TSQuery *) dl->injection_query); + if (dl->parser != NULL) + ts_parser_delete ((TSParser *) dl->parser); + g_free (dl); + g_free (key); + } + g_hash_table_destroy (edit->ts.injection_lang_cache); + edit->ts.injection_lang_cache = NULL; + } + + // Free primary resources + if (edit->ts.highlight_query != NULL) + { + ts_query_delete ((TSQuery *) edit->ts.highlight_query); + edit->ts.highlight_query = NULL; + } + + if (edit->ts.tree != NULL) + { + ts_tree_delete ((TSTree *) edit->ts.tree); + edit->ts.tree = NULL; + } + + if (edit->ts.parser != NULL) + { + ts_parser_delete ((TSParser *) edit->ts.parser); + edit->ts.parser = NULL; + } + + if (edit->ts.highlights != NULL) + { + g_array_free (edit->ts.highlights, TRUE); + edit->ts.highlights = NULL; + } + + g_free (edit->ts.grammar_name); + edit->ts.grammar_name = NULL; + edit->ts.highlights_start = -1; + edit->ts.highlights_end = -1; + edit->ts.active = FALSE; + + /* Clear the grammar registry and color map so they're reloaded on next init. + This is needed because tty_color_free_temp() (called by + edit_free_syntax_rules) invalidates all temporary color pairs, + including the ones stored in ts_color_map. 
*/ + if (ts_color_map != NULL) + { + g_hash_table_destroy (ts_color_map); + ts_color_map = NULL; + ts_error_color = -1; + } + if (ts_grammar_configs != NULL) + { + g_hash_table_destroy (ts_grammar_configs); + ts_grammar_configs = NULL; + } + if (ts_ext_map != NULL) + { + g_hash_table_destroy (ts_ext_map); + ts_ext_map = NULL; + } + if (ts_filename_map != NULL) + { + g_hash_table_destroy (ts_filename_map); + ts_filename_map = NULL; + } + if (ts_shebang_map != NULL) + { + g_hash_table_destroy (ts_shebang_map); + ts_shebang_map = NULL; + } + if (ts_display_to_grammar != NULL) + { + g_hash_table_destroy (ts_display_to_grammar); + ts_display_to_grammar = NULL; + } + if (ts_wrapper_host_map != NULL) + { + g_hash_table_destroy (ts_wrapper_host_map); + ts_wrapper_host_map = NULL; + } + if (ts_wrapper_node_map != NULL) + { + g_hash_table_destroy (ts_wrapper_node_map); + ts_wrapper_node_map = NULL; + } + ts_registry_loaded = FALSE; +} + +/* --------------------------------------------------------------------------------------------- */ + +/** + * Run a highlight query on a tree and append results to the highlights array. + * grammar_name is used for per-grammar color lookup in colors.ini. + * Evaluates filter predicates (#eq?, #match?, etc.) to skip non-matching captures. + */ +static void +ts_run_query_into_highlights (TSQuery *query, TSTree *tree, uint32_t range_start, + uint32_t range_end, GArray *highlights, + const char *grammar_name, WEdit *edit) +{ + TSNode root; + TSQueryCursor *cursor; + TSQueryMatch match; + + root = ts_tree_root_node (tree); + + cursor = ts_query_cursor_new (); + ts_query_cursor_set_byte_range (cursor, range_start, range_end); + ts_query_cursor_exec (cursor, query, root); + + while (ts_query_cursor_next_match (cursor, &match)) + { + uint32_t ci; + + /* Evaluate filter predicates (#eq?, #match?, #any-of?, etc.) 
*/ + if (edit != NULL && !ts_evaluate_match_predicates (query, &match, edit)) + continue; + + for (ci = 0; ci < match.capture_count; ci++) + { + TSQueryCapture cap = match.captures[ci]; + uint32_t cap_name_len; + const char *cap_name; + ts_highlight_entry_t entry; + + cap_name = ts_query_capture_name_for_id (query, cap.index, &cap_name_len); + + entry.start_byte = ts_node_start_byte (cap.node); + entry.end_byte = ts_node_end_byte (cap.node); + entry.color = ts_capture_name_to_color (cap_name, grammar_name); + + // Only include entries that overlap with our range + if (entry.end_byte > range_start && entry.start_byte < range_end) + g_array_append_val (highlights, entry); + } + } + + ts_query_cursor_delete (cursor); +} + +/* --------------------------------------------------------------------------------------------- */ + +/** + * Look up or create a cached parser+query for a dynamically-injected language. + * Returns NULL if the language is unknown or the query fails to compile. + */ +static ts_dynamic_lang_t * +ts_get_dynamic_lang (GHashTable *lang_cache, const char *lang_name) +{ + ts_dynamic_lang_t *dl; + const TSLanguage *lang; + TSParser *parser; + char *query_src; + uint32_t query_len; + uint32_t error_offset; + TSQueryError error_type; + TSQuery *query; + + dl = (ts_dynamic_lang_t *) g_hash_table_lookup (lang_cache, lang_name); + if (dl != NULL) + return dl->parser != NULL ? 
dl : NULL; + + // Not cached yet — try to create + lang = ts_grammar_registry_lookup (lang_name); + if (lang == NULL) + { + // Cache a NULL entry so we don't retry + dl = g_new0 (ts_dynamic_lang_t, 1); + g_hash_table_insert (lang_cache, g_strdup (lang_name), dl); + return NULL; + } + + parser = ts_parser_new (); + if (!ts_parser_set_language (parser, lang)) + { + ts_parser_delete (parser); + dl = g_new0 (ts_dynamic_lang_t, 1); + g_hash_table_insert (lang_cache, g_strdup (lang_name), dl); + return NULL; + } + + query_src = ts_load_query_file (lang_name, "highlights.scm", &query_len); + + if (query_src == NULL) + { + ts_parser_delete (parser); + dl = g_new0 (ts_dynamic_lang_t, 1); + g_hash_table_insert (lang_cache, g_strdup (lang_name), dl); + return NULL; + } + + query = ts_query_new (lang, query_src, query_len, &error_offset, &error_type); + g_free (query_src); + + if (query == NULL) + { + ts_parser_delete (parser); + dl = g_new0 (ts_dynamic_lang_t, 1); + g_hash_table_insert (lang_cache, g_strdup (lang_name), dl); + return NULL; + } + + dl = g_new0 (ts_dynamic_lang_t, 1); + dl->parser = parser; + dl->query = query; + g_hash_table_insert (lang_cache, g_strdup (lang_name), dl); + return dl; +} + +/** + * Recursively collect ERROR nodes within a byte range and append red highlights. 
+ */ +static void +ts_collect_error_highlights (TSNode node, uint32_t range_start, uint32_t range_end, + GArray *highlights) +{ + uint32_t start, end, i, child_count; + + start = ts_node_start_byte (node); + end = ts_node_end_byte (node); + + /* Skip nodes entirely outside the range */ + if (end <= range_start || start >= range_end) + return; + + if (ts_node_is_error (node)) + { + ts_highlight_entry_t entry; + + entry.start_byte = start; + entry.end_byte = end; + entry.color = ts_error_color; + g_array_append_val (highlights, entry); + return; /* Don't recurse into ERROR children */ + } + + child_count = ts_node_child_count (node); + for (i = 0; i < child_count; i++) + ts_collect_error_highlights (ts_node_child (node, i), range_start, range_end, highlights); +} + +/* --------------------------------------------------------------------------------------------- */ + +/* Maximum nesting depth for recursive injection (e.g., gotmpl -> markdown -> python). + Prevents infinite loops from circular injection configurations. */ +#define TS_MAX_INJECTION_DEPTH 3 + +/** + * Parse an injected language within a content node range and run its highlights. + * If the injected language has its own injections.scm, recurse up to max_depth. 
+ */
+static void ts_inject_and_highlight (const char *lang_name, TSNode content_node, TSInput input,
+                                     uint32_t range_start, uint32_t range_end,
+                                     GArray *highlights, GHashTable *lang_cache, WEdit *edit,
+                                     int depth);
+
+/* --------------------------------------------------------------------------------------------- */
+
+/* Size of the scratch buffer (terminating NUL included) for a language name that is
+   read from the document; longer names are ignored. */
+#define TS_INJECTION_LANG_NAME_MAX 64
+
+/**
+ * Read the text covered by an "injection.language" capture from the editor buffer.
+ *
+ * The text is copied into BUF, leading blanks/tabs and trailing blanks, tabs, CR and
+ * LF are stripped.
+ *
+ * Returns a pointer into BUF, or NULL when the node is empty, does not fit into
+ * BUF_SIZE bytes, consists of whitespace only, or contains a path separator.
+ */
+static const char *
+ts_read_injection_lang_name (WEdit *edit, TSNode lang_node, char *buf, size_t buf_size)
+{
+    uint32_t first, len, i;
+    char *s, *e;
+
+    first = ts_node_start_byte (lang_node);
+    len = ts_node_end_byte (lang_node) - first;
+    if (len == 0 || len >= buf_size)
+        return NULL;
+
+    for (i = 0; i < len; i++)
+        buf[i] = (char) edit_buffer_get_byte (&edit->buffer, (off_t) (first + i));
+    buf[len] = '\0';
+
+    s = buf;
+    while (*s == ' ' || *s == '\t')
+        s++;
+    e = s + strlen (s);
+    while (e > s && (e[-1] == ' ' || e[-1] == '\t' || e[-1] == '\n' || e[-1] == '\r'))
+        e--;
+    *e = '\0';
+
+    /* This name is document content, i.e. untrusted.  It is handed to
+       ts_get_dynamic_lang(), which passes it to ts_grammar_registry_lookup() and
+       ts_load_query_file(); never let it carry a directory component. */
+    if (*s == '\0' || strpbrk (s, "/\\") != NULL)
+        return NULL;
+
+    return s;
+}
+
+/* --------------------------------------------------------------------------------------------- */
+
+/**
+ * Run an injection query over ROOT and highlight every injected region it reports.
+ *
+ * For each match that passes ts_evaluate_match_predicates() and has an
+ * "injection.content" capture, the language is taken from the pattern's
+ * (#set! injection.language ...) property if present, otherwise from the text of the
+ * "injection.language" capture.  DEPTH is passed unchanged to
+ * ts_inject_and_highlight().
+ */
+static void
+ts_run_injection_matches (TSQuery *inj_query, TSNode root, TSInput input, uint32_t range_start,
+                          uint32_t range_end, GArray *highlights, GHashTable *lang_cache,
+                          WEdit *edit, int depth)
+{
+    TSQueryCursor *cursor;
+    TSQueryMatch match;
+
+    cursor = ts_query_cursor_new ();
+    ts_query_cursor_set_byte_range (cursor, range_start, range_end);
+    ts_query_cursor_exec (cursor, inj_query, root);
+
+    while (ts_query_cursor_next_match (cursor, &match))
+    {
+        TSNode content_node = { .id = NULL };
+        TSNode lang_node = { .id = NULL };
+        char lang_buf[TS_INJECTION_LANG_NAME_MAX];
+        const char *lang_name;
+        uint32_t ci;
+
+        if (!ts_evaluate_match_predicates (inj_query, &match, edit))
+            continue;
+
+        for (ci = 0; ci < match.capture_count; ci++)
+        {
+            uint32_t name_len;
+            const char *cap_name;
+
+            cap_name =
+                ts_query_capture_name_for_id (inj_query, match.captures[ci].index, &name_len);
+
+            if (strcmp (cap_name, "injection.content") == 0)
+                content_node = match.captures[ci].node;
+            else if (strcmp (cap_name, "injection.language") == 0)
+                lang_node = match.captures[ci].node;
+        }
+
+        if (ts_node_is_null (content_node))
+            continue;
+
+        /* A language fixed by the query wins over one named in the document. */
+        lang_name = ts_get_set_predicate (inj_query, match.pattern_index, "injection.language");
+        if (lang_name == NULL && !ts_node_is_null (lang_node))
+            lang_name = ts_read_injection_lang_name (edit, lang_node, lang_buf, sizeof (lang_buf));
+
+        if (lang_name != NULL)
+            ts_inject_and_highlight (lang_name, content_node, input, range_start, range_end,
+                                     highlights, lang_cache, edit, depth);
+    }
+
+    ts_query_cursor_delete (cursor);
+}
+
+/* --------------------------------------------------------------------------------------------- */
+
+/* Definition of ts_inject_and_highlight(); the contract is documented at its
+   forward declaration. */
+static void
+ts_inject_and_highlight (const char *lang_name, TSNode content_node, TSInput input,
+                         uint32_t range_start, uint32_t range_end, GArray *highlights,
+                         GHashTable *lang_cache, WEdit *edit, int depth)
+{
+    ts_dynamic_lang_t *dl;
+    TSRange r;
+    TSTree *inject_tree;
+
+    dl = ts_get_dynamic_lang (lang_cache, lang_name);
+    if (dl == NULL)
+        return;
+
+    /* Restrict the injected parser to the content node; everything else in the
+       buffer is invisible to it. */
+    r.start_point = ts_node_start_point (content_node);
+    r.end_point = ts_node_end_point (content_node);
+    r.start_byte = ts_node_start_byte (content_node);
+    r.end_byte = ts_node_end_byte (content_node);
+
+    ts_parser_set_included_ranges ((TSParser *) dl->parser, &r, 1);
+    inject_tree = ts_parser_parse ((TSParser *) dl->parser, NULL, input);
+    if (inject_tree == NULL)
+        return;
+
+    ts_run_query_into_highlights ((TSQuery *) dl->query, inject_tree,
+                                  range_start, range_end, highlights, lang_name, edit);
+
+    /* Recurse: if the injected language has its own injections.scm,
+       process nested injections (e.g., markdown -> markdown_inline,
+       markdown -> python for fenced code blocks). */
+    if (depth > 0)
+    {
+        const TSLanguage *inj_lang;
+
+        inj_lang = ts_grammar_registry_lookup (lang_name);
+        if (inj_lang != NULL)
+        {
+            /* Lazy-load the nested injection query and cache it on the
+               dynamic language entry to avoid recompiling on every call.
+               A missing or invalid injections.scm leaves the query NULL. */
+            if (!dl->injection_query_loaded)
+            {
+                char *nested_inj_src;
+                uint32_t nested_inj_len;
+
+                nested_inj_src = ts_load_query_file (lang_name, "injections.scm", &nested_inj_len);
+
+                if (nested_inj_src != NULL)
+                {
+                    uint32_t eo;
+                    TSQueryError et;
+
+                    dl->injection_query =
+                        ts_query_new (inj_lang, nested_inj_src, nested_inj_len, &eo, &et);
+                    g_free (nested_inj_src);
+                }
+                dl->injection_query_loaded = TRUE;
+            }
+
+            if (dl->injection_query != NULL)
+                ts_run_injection_matches ((TSQuery *) dl->injection_query,
+                                          ts_tree_root_node (inject_tree), input, range_start,
+                                          range_end, highlights, lang_cache, edit, depth - 1);
+        }
+    }
+
+    ts_tree_delete (inject_tree);
+}
+
+/* --------------------------------------------------------------------------------------------- */
+
+/**
+ * Append error-colored entries for [range_start, range_end) to HIGHLIGHTS.
+ *
+ * Does nothing when error highlighting is disabled (ts_error_color < 0) or when the
+ * root itself is an ERROR node.  When the root is ERROR (e.g. macro-heavy C files like
+ * sqlite3.c), the entire file would be painted red, which is unhelpful; in that case
+ * the highlight query captures show whatever they can and uncaptured regions stay
+ * DEFAULT.
+ */
+static void
+ts_append_error_highlights (TSNode root, uint32_t range_start, uint32_t range_end,
+                            GArray *highlights)
+{
+    uint32_t root_end;
+
+    if (ts_error_color < 0 || ts_node_is_error (root))
+        return;
+
+    ts_collect_error_highlights (root, range_start, range_end, highlights);
+
+    /* If the tree root does not cover the full visible range (e.g. the
+       parser gave up early), color the uncovered tail as error too. */
+    root_end = ts_node_end_byte (root);
+    if (root_end < range_end)
+    {
+        ts_highlight_entry_t entry;
+
+        entry.start_byte = MAX (root_end, range_start);
+        entry.end_byte = range_end;
+        entry.color = ts_error_color;
+        g_array_append_val (highlights, entry);
+    }
+}
+
+/* --------------------------------------------------------------------------------------------- */
+
+/**
+ * Rebuild the highlight cache for the given byte range.
+ * Runs the highlight query and collects (start_byte, end_byte, color) entries.
+ * If injection is active, also runs the injection query on target node ranges.
+ *
+ * A pending re-parse (see edit_syntax_ts_notify_edit()) is carried out first.  On
+ * return the cache is marked as covering [range_start, range_end).  Does nothing if
+ * tree-sitter is not active for this editor.
+ */
+void
+ts_rebuild_highlight_cache (WEdit *edit, off_t range_start, off_t range_end)
+{
+    /* NOTE(review): tree-sitter offsets are 32 bit; these casts truncate for
+       offsets above UINT32_MAX -- confirm such buffers never get here. */
+    const uint32_t first = (uint32_t) range_start;
+    const uint32_t last = (uint32_t) range_end;
+    TSTree *tree;
+    TSInput input;
+    gboolean had_edit;
+
+    if (!edit->ts.active)
+        return;
+
+    input.payload = edit;
+    input.read = ts_input_read;
+    input.encoding = TSInputEncodingUTF8;
+
+    /* Remember if this rebuild was triggered by an edit (vs scroll).
+       During rapid editing, we skip the expensive injection processing
+       to keep the render responsive. Injections are only refreshed when
+       rebuilding due to a scroll (cache invalidated by viewport change)
+       or when the editor is idle. */
+    had_edit = edit->ts.need_reparse;
+
+    // Perform deferred re-parse if the tree was edited since last parse
+    if (edit->ts.need_reparse)
+    {
+        TSTree *new_tree;
+
+        new_tree =
+            ts_parser_parse ((TSParser *) edit->ts.parser, (TSTree *) edit->ts.tree, input);
+        if (new_tree != NULL)
+        {
+            ts_tree_delete ((TSTree *) edit->ts.tree);
+            edit->ts.tree = new_tree;
+        }
+
+        edit->ts.need_reparse = FALSE;
+    }
+
+    tree = (TSTree *) edit->ts.tree;
+
+    g_array_set_size (edit->ts.highlights, 0);
+
+    /* Without a tree there is nothing to query; leave the cache empty instead of
+       dereferencing NULL in ts_tree_root_node(). */
+    if (tree != NULL)
+    {
+        TSNode root;
+
+        root = ts_tree_root_node (tree);
+
+        // Run the primary highlight query
+        ts_run_query_into_highlights ((TSQuery *) edit->ts.highlight_query, tree, first, last,
+                                      edit->ts.highlights, edit->ts.grammar_name, edit);
+
+        /* Run injection queries if configured. Skip during rapid edits to keep
+           rendering responsive - injections are expensive (parse + query for each
+           injected range). Edits that don't change injection structure won't lose
+           much; the next idle rebuild will refresh injections. */
+        if (edit->ts.injection_query != NULL && (!had_edit || is_idle ()))
+            ts_run_injection_matches ((TSQuery *) edit->ts.injection_query, root, input, first,
+                                      last, edit->ts.highlights, edit->ts.injection_lang_cache,
+                                      edit, TS_MAX_INJECTION_DEPTH);
+
+        /* Collect ERROR nodes and highlight them in red. ERROR entries are appended
+           last so that valid captures (which are narrower) take precedence via the
+           "narrower wins" rule in ts_get_color_at(). */
+        ts_append_error_highlights (root, first, last, edit->ts.highlights);
+    }
+
+    edit->ts.highlights_start = range_start;
+    edit->ts.highlights_end = range_end;
+}
+
+/* --------------------------------------------------------------------------------------------- */
+
+/**
+ * Look up the color for a byte index in the highlight cache.
+ *
+ * Among all cached entries containing BYTE_INDEX the narrowest one wins; between
+ * entries of equal width the one stored first wins.  Returns EDITOR_NORMAL_COLOR when
+ * there is no cache or no entry contains BYTE_INDEX.
+ */
+int
+ts_get_color_at (WEdit *edit, off_t byte_index)
+{
+    guint i;
+    int color = EDITOR_NORMAL_COLOR;
+    uint32_t best_width = 0;
+    gboolean found = FALSE;
+
+    if (edit->ts.highlights == NULL)
+        return EDITOR_NORMAL_COLOR;
+
+    for (i = 0; i < edit->ts.highlights->len; i++)
+    {
+        const ts_highlight_entry_t *e;
+        uint32_t width;
+
+        e = &g_array_index (edit->ts.highlights, ts_highlight_entry_t, i);
+
+        if (byte_index < (off_t) e->start_byte || byte_index >= (off_t) e->end_byte)
+            continue;
+
+        width = e->end_byte - e->start_byte;
+
+        /* "Matched yet?" is tracked separately from the color: an entry whose
+           color happens to be EDITOR_NORMAL_COLOR must still beat wider entries
+           (such as an enclosing ERROR span) that come later. */
+        if (!found || width < best_width)
+        {
+            color = e->color;
+            best_width = width;
+            found = TRUE;
+        }
+    }
+
+    return color;
+}
+
+/* --------------------------------------------------------------------------------------------- */
+
+/**
+ * Notify tree-sitter that the buffer was edited.
+ * Called after insert/delete operations.
+ *
+ * Only records the edit in the tree (ts_tree_edit) and marks the tree as needing
+ * re-parse. The actual re-parse is deferred to ts_rebuild_highlight_cache() so
+ * that bulk operations (block delete + insert) don't re-parse on every character.
+ *
+ * START_BYTE is where the edit begins, OLD_END_BYTE where the replaced text ended
+ * before the edit and NEW_END_BYTE where the new text ends after it.  Does nothing
+ * unless tree-sitter is active and both a tree and a parser exist.
+ */
+void
+edit_syntax_ts_notify_edit (WEdit *edit, off_t start_byte, off_t old_end_byte,
+                            off_t new_end_byte)
+{
+    TSInputEdit ts_edit;
+
+    if (!edit->ts.active || edit->ts.tree == NULL || edit->ts.parser == NULL)
+        return;
+
+    ts_edit.start_byte = (uint32_t) start_byte;
+    ts_edit.old_end_byte = (uint32_t) old_end_byte;
+    ts_edit.new_end_byte = (uint32_t) new_end_byte;
+    /* NOTE(review): row/column points are not tracked and are reported as {0, 0};
+       only byte offsets are meaningful.  Confirm nothing relies on node points
+       staying accurate across incremental re-parses. */
+    ts_edit.start_point = (TSPoint){ 0, 0 };
+    ts_edit.old_end_point = (TSPoint){ 0, 0 };
+    ts_edit.new_end_point = (TSPoint){ 0, 0 };
+
+    ts_tree_edit ((TSTree *) edit->ts.tree, &ts_edit);
+
+    // Mark tree as needing re-parse; defer actual parsing to cache rebuild
+    edit->ts.need_reparse = TRUE;
+
+    // Invalidate highlight cache
+    edit->ts.highlights_start = -1;
+    edit->ts.highlights_end = -1;
+}
+
+#endif /* HAVE_TREE_SITTER */
diff --git a/src/editor/syntax_ts.h b/src/editor/syntax_ts.h
new file mode 100644
index 0000000000..1ab20d58c4
--- /dev/null
+++ b/src/editor/syntax_ts.h
@@ -0,0 +1,32 @@
+/*
+   Tree-sitter syntax highlighting integration - public API.
+ */ + +/** \file syntax_ts.h + * \brief Header: tree-sitter syntax highlighting for the editor + */ + +#ifndef MC__EDIT_SYNTAX_TS_H +#define MC__EDIT_SYNTAX_TS_H + +#ifdef HAVE_TREE_SITTER + +/*** global variables defined in .c file ********************************************************/ + +/*** declarations of public functions ***********************************************************/ + +gboolean ts_init_for_file (WEdit *edit, const char *forced_grammar); +char *ts_config_reverse_lookup (const char *config_name, const char *display_value); +void ts_load_grammar_registry (void); /* exposed for mc-syntax-dump */ +void ts_free (WEdit *edit); +int ts_get_color_at (WEdit *edit, off_t byte_index); +void ts_rebuild_highlight_cache (WEdit *edit, off_t range_start, off_t range_end); + +/* Functions from syntax.c needed by the tree-sitter module */ +size_t read_one_line (char **line, FILE *f); +int this_try_alloc_color_pair (tty_color_pair_t *color); +const char *get_first_editor_line (WEdit *edit); + +#endif /* HAVE_TREE_SITTER */ + +#endif /* MC__EDIT_SYNTAX_TS_H */ diff --git a/src/editor/ts-grammar-loader.h b/src/editor/ts-grammar-loader.h new file mode 100644 index 0000000000..0ec46ff411 --- /dev/null +++ b/src/editor/ts-grammar-loader.h @@ -0,0 +1,142 @@ +/* + Runtime loader for tree-sitter grammar shared modules. + + Used in shared mode (TREE_SITTER_SHARED): grammars are loaded on demand + via g_module_open() from TS_GRAMMAR_LIBDIR (e.g. /usr/lib/mc/ts-grammars/). + + Provides ts_grammar_registry_lookup() with the same interface as the static + registry in ts-grammars/ts-grammar-registry.h, so syntax.c works unchanged. 
+ */
+
+#ifndef MC__TS_GRAMMAR_LOADER_H
+#define MC__TS_GRAMMAR_LOADER_H
+
+#include <gmodule.h>
+#include <tree_sitter/api.h>
+
+/* Implemented in syntax_ts.c -- reads grammar config */
+char *ts_get_symbol_override (const char *grammar_name);
+
+/*** Cached loaded modules ***/
+
+/* One cache slot per grammar name that was ever looked up.  Both fields stay NULL
+   for a grammar that could not be loaded (negative cache entry). */
+typedef struct
+{
+    GModule *module;
+    const TSLanguage *(*func) (void);
+} ts_loaded_grammar_t;
+
+/* Grammar name (owned string) -> ts_loaded_grammar_t* (owned).  Created on first
+   lookup.  Being `static` in a header, every translation unit that includes this
+   file gets its own table. */
+static GHashTable *ts_loaded_grammars = NULL;
+
+/* --------------------------------------------------------------------------------------------- */
+
+/**
+ * Check that a grammar name can be used as a single file name component.
+ *
+ * Grammar names may originate from the document being edited (injected languages),
+ * and the name is appended to a directory to form the module path.  Rejecting path
+ * separators keeps the lookup inside the grammar directories.
+ */
+static inline gboolean
+ts_grammar_name_is_safe (const char *name)
+{
+    const char *p;
+
+    if (name == NULL || *name == '\0')
+        return FALSE;
+
+    for (p = name; *p != '\0'; p++)
+        if (*p == '/' || *p == '\\')
+            return FALSE;
+
+    return TRUE;
+}
+
+/* --------------------------------------------------------------------------------------------- */
+
+/**
+ * Open the shared module for grammar NAME.
+ *
+ * Module files are named NAME.so/.dylib/.dll; g_module_open() appends the platform
+ * suffix automatically when omitted.  The user-local path
+ * (~/.local/lib/mc/ts-grammars/) is tried first, then the system path
+ * (TS_GRAMMAR_LIBDIR).  Returns NULL if neither can be opened.
+ */
+static inline GModule *
+ts_grammar_open_module (const char *name)
+{
+    const char *home;
+    char *module_path;
+    GModule *module = NULL;
+
+    home = g_get_home_dir ();
+    if (home != NULL)
+    {
+        module_path = g_strdup_printf ("%s/.local/lib/mc/ts-grammars/%s", home, name);
+        module = g_module_open (module_path, G_MODULE_BIND_LAZY);
+        g_free (module_path);
+    }
+
+    if (module == NULL)
+    {
+        module_path = g_strdup_printf ("%s/%s", TS_GRAMMAR_LIBDIR, name);
+        module = g_module_open (module_path, G_MODULE_BIND_LAZY);
+        g_free (module_path);
+    }
+
+    return module;
+}
+
+/* --------------------------------------------------------------------------------------------- */
+
+/**
+ * Look up a grammar by name: load the shared module on first use, cache the result.
+ *
+ * Returns the TSLanguage* or NULL if the grammar module is not installed, does not
+ * export the expected entry point, or NAME is NULL, empty or contains a path
+ * separator.  Failures are cached too, so a missing grammar is probed only once.
+ */
+static inline const TSLanguage *
+ts_grammar_registry_lookup (const char *name)
+{
+    ts_loaded_grammar_t *entry;
+    GModule *module;
+    char *override;
+    char *symbol_name;
+    gpointer symbol = NULL;
+
+    if (!ts_grammar_name_is_safe (name))
+        return NULL;
+
+    if (ts_loaded_grammars == NULL)
+        ts_loaded_grammars = g_hash_table_new (g_str_hash, g_str_equal);
+
+    /* Check cache */
+    entry = (ts_loaded_grammar_t *) g_hash_table_lookup (ts_loaded_grammars, name);
+    if (entry != NULL)
+        return entry->func != NULL ? entry->func () : NULL;
+
+    /* Register a "not available" entry right away so that no failure path below
+       is retried; it is filled in once the entry point has been resolved. */
+    entry = g_new0 (ts_loaded_grammar_t, 1);
+    g_hash_table_insert (ts_loaded_grammars, g_strdup (name), entry);
+
+    module = ts_grammar_open_module (name);
+    if (module == NULL)
+        return NULL;
+
+    /* Determine the symbol name from config.ini.
+       Default is tree_sitter_NAME. Override if config has symbol= entry. */
+    override = ts_get_symbol_override (name);
+    symbol_name = g_strdup_printf ("tree_sitter_%s", override != NULL ? override : name);
+    g_free (override);
+
+    /* g_module_symbol() may succeed with a NULL symbol; that is not callable. */
+    if (!g_module_symbol (module, symbol_name, &symbol) || symbol == NULL)
+    {
+        g_free (symbol_name);
+        g_module_close (module);
+        return NULL;
+    }
+    g_free (symbol_name);
+
+    /* Cache the successful result */
+    entry->module = module;
+    entry->func = (const TSLanguage * (*) (void)) symbol;
+
+    return entry->func ();
+}
+
+/* --------------------------------------------------------------------------------------------- */
+
+/**
+ * Clean up loaded grammar modules at shutdown.
+ * We intentionally do NOT close the modules because TSLanguage pointers
+ * may still be referenced by parsers. The OS reclaims them at exit.
+ */
+static inline void
+ts_grammar_modules_cleanup (void)
+{
+    GHashTableIter it;
+    gpointer cached_name, cached_entry;
+
+    if (ts_loaded_grammars == NULL)
+        return;
+
+    /* The table was created without destroy notifiers, so the duplicated names
+       and the cache entries have to be released by hand. */
+    g_hash_table_iter_init (&it, ts_loaded_grammars);
+    while (g_hash_table_iter_next (&it, &cached_name, &cached_entry))
+    {
+        g_free (cached_name);
+        g_free (cached_entry);
+    }
+
+    g_hash_table_destroy (ts_loaded_grammars);
+    ts_loaded_grammars = NULL;
+}
+
+#endif /* MC__TS_GRAMMAR_LOADER_H */
diff --git a/src/setup.c b/src/setup.c index aaa32bd6ad..a390d4e145 100644 --- a/src/setup.c +++ b/src/setup.c @@ -355,6 +355,9 @@ static const struct { "editor_show_right_margin", &edit_options.show_right_margin }, { "editor_group_undo", &edit_options.group_undo }, { "editor_state_full_filename", &edit_options.state_full_filename }, +#ifdef HAVE_TREE_SITTER + { "editor_use_tree_sitter", &edit_options.use_tree_sitter }, +#endif #endif { "editor_ask_filename_before_edit", &editor_ask_filename_before_edit }, { "nice_rotating_dash", &nice_rotating_dash }, diff --git a/tests/.gitignore b/tests/.gitignore index 53aac0c4be..64c077bea2 100644 --- a/tests/.gitignore +++ b/tests/.gitignore @@ -50,21 +50,30 @@ lib/strutil/replace__str_replace_all lib/strutil/replace__str_replace_all.log lib/strutil/replace__str_replace_all.trs lib/strutil/str_replace_all +lib/strutil/str_replace_all.log lib/strutil/str_rstrip_eol +lib/strutil/str_rstrip_eol.log lib/strutil/str_verscmp lib/strutil/str_verscmp.log lib/strutil/str_verscmp.trs lib/strutil/test-suite.log lib/terminal +lib/terminal.log lib/test-suite.log lib/tty +lib/tty.log lib/utilinux__my_system-fork_child.log lib/utilinux__my_system-fork_child_shell.log lib/utilunix__mc_pstream_get_string +lib/utilunix__mc_pstream_get_string.log lib/widget/group_init_destroy +lib/widget/group_init_destroy.log lib/widget/hotkey_equal +lib/widget/hotkey_equal.log lib/widget/widget_find_by_id +lib/widget/widget_find_by_id.log lib/widget/widget_make_global_local +lib/widget/widget_make_global_local.log
lib/utilinux__my_system-fork_fail.log lib/utilunix__my_system_fork_child lib/utilunix__my_system_fork_child.log @@ -143,7 +152,12 @@ src/editor/editcmd__edit_complete_word_cmd src/editor/editcmd__edit_complete_word_cmd.log src/editor/editcmd__edit_complete_word_cmd.trs src/editor/edit_complete_word_cmd +src/editor/edit_insert_column_of_text +src/editor/edit_insert_column_of_text.log src/editor/edit_replace_cmd +src/editor/edit_replace_cmd.log +src/editor/edit_syntax_ts +src/editor/edit_syntax_ts.log src/editor/test-suite.log src/execute__execute_external_editor_or_viewer src/execute__execute_external_editor_or_viewer.log @@ -154,7 +168,10 @@ src/execute__execute_get_external_cmd_opts_from_config.trs src/execute__execute_with_vfs_arg src/execute__execute_with_vfs_arg.log src/execute__execute_with_vfs_arg.trs +src/file_history +src/file_history.log src/filemanager/cd_to +src/filemanager/cd_to.log src/filemanager/do_cd_command src/filemanager/do_cd_command.log src/filemanager/do_cd_command.trs @@ -172,10 +189,16 @@ src/filemanager/get_random_hint.log src/filemanager/get_random_hint.trs src/filemanager/test-suite.log src/test-suite.log +src/usermenu__test_condition +src/usermenu__test_condition.log +src/usermenu__test_expand_format +src/usermenu__test_expand_format.log src/vfs/extfs/helpers-list/mc_parse_ls_l src/vfs/extfs/helpers-list/run src/vfs/extfs/helpers-list/run.log src/vfs/extfs/helpers-list/run.trs src/vfs/extfs/helpers-list/test-suite.log src/vfs/ftpfs/ftpfs_parse_long_list +src/vfs/ftpfs/ftpfs_parse_long_list.log +src/vfs/ftpfs/test-suite.log *.trs diff --git a/tests/src/editor/Makefile.am b/tests/src/editor/Makefile.am index 81f3d9ae37..2e52164eb4 100644 --- a/tests/src/editor/Makefile.am +++ b/tests/src/editor/Makefile.am @@ -21,6 +21,10 @@ TESTS = \ edit_insert_column_of_text \ edit_replace_cmd +if USE_TREE_SITTER +TESTS += edit_syntax_ts +endif + check_PROGRAMS = $(TESTS) edit_complete_word_cmd_SOURCES = \ @@ -32,3 +36,21 @@ 
edit_insert_column_of_text_SOURCES = \ edit_replace_cmd_SOURCES = \ edit_replace_cmd.c +if USE_TREE_SITTER +edit_syntax_ts_SOURCES = \ + edit_syntax_ts.c + +edit_syntax_ts_CPPFLAGS = \ + -DTS_GRAMMAR_LIBDIR=\"$(libdir)/mc/ts-grammars\" \ + $(GLIB_CFLAGS) \ + $(TREE_SITTER_CFLAGS) \ + $(GMODULE_CFLAGS) \ + -I$(top_srcdir) \ + @CHECK_CFLAGS@ + +edit_syntax_ts_LDADD = \ + $(TREE_SITTER_LIBS) \ + $(GMODULE_LIBS) \ + $(GLIB_LIBS) \ + @CHECK_LIBS@ +endif diff --git a/tests/src/editor/edit_syntax_ts.c b/tests/src/editor/edit_syntax_ts.c new file mode 100644 index 0000000000..5979dd9095 --- /dev/null +++ b/tests/src/editor/edit_syntax_ts.c @@ -0,0 +1,663 @@ +/* + src/editor - tests for tree-sitter syntax highlighting integration + + Copyright (C) 2026 + Free Software Foundation, Inc. + + This file is part of the Midnight Commander. + + The Midnight Commander is free software: you can redistribute it + and/or modify it under the terms of the GNU General Public License as + published by the Free Software Foundation, either version 3 of the License, + or (at your option) any later version. + + The Midnight Commander is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . + */ + +#define TEST_SUITE_NAME "/src/editor" + +#include "tests/mctest.h" + +#include +#include + +#include "src/editor/ts-grammar-loader.h" + +/* --------------------------------------------------------------------------------------------- */ + +/* Path to the query files directory (set via -DTEST_TS_QUERIES_DIR) */ +#ifndef TEST_TS_QUERIES_DIR +#error "TEST_TS_QUERIES_DIR must be defined" +#endif + +/** + * Find a query file. Checks source tree first, then user's installed location. 
+ * On success PATH (of PATH_SIZE bytes) holds the location and TRUE is returned;
+ * otherwise FALSE is returned and PATH holds the last candidate that was tried.
+ */
+static gboolean
+test_find_query_file (char *path, size_t path_size, const char *filename)
+{
+    const char *home_dir;
+
+    /* Candidate 1: the query directory configured at build time */
+    snprintf (path, path_size, "%s/%s", TEST_TS_QUERIES_DIR, filename);
+    if (g_file_test (path, G_FILE_TEST_IS_REGULAR))
+        return TRUE;
+
+    /* Candidate 2: queries installed for the current user */
+    home_dir = g_get_home_dir ();
+    if (home_dir == NULL)
+        return FALSE;
+
+    snprintf (path, path_size, "%s/.local/share/mc/syntax-ts/queries/%s", home_dir, filename);
+    return g_file_test (path, G_FILE_TEST_IS_REGULAR);
+}
+
+/* Shared mode provides no HAVE_GRAMMAR_* macros, so define the ones the tests below
+   are guarded by.  Whether a grammar is really available is then decided at run time
+   by ts_grammar_registry_lookup(). */
+#ifdef TREE_SITTER_SHARED
+#define HAVE_GRAMMAR_C 1
+#define HAVE_GRAMMAR_PYTHON 1
+#define HAVE_GRAMMAR_BASH 1
+#define HAVE_GRAMMAR_MARKDOWN 1
+#define HAVE_GRAMMAR_MARKDOWN_INLINE 1
+#define HAVE_GRAMMAR_HTML 1
+#define HAVE_GRAMMAR_JAVASCRIPT 1
+#define HAVE_GRAMMAR_CSS 1
+#endif
+
+/* --------------------------------------------------------------------------------------------- */
+
+/* @Before */
+static void
+setup (void)
+{
+}
+
+/* --------------------------------------------------------------------------------------------- */
+
+/* @After */
+static void
+teardown (void)
+{
+}
+
+/* --------------------------------------------------------------------------------------------- */
+/* Test 1: the grammar registry resolves the names of known grammars */
+
+static const struct test_registry_lookup_found_ds
+{
+    const char *grammar_name;
+} test_registry_lookup_found_ds[] = {
+    { "c" },
+    { "python" },
+    { "bash" },
+};
+
+/* @Test(dataSource = "test_registry_lookup_found_ds") */
+START_PARAMETRIZED_TEST (test_registry_lookup_found, test_registry_lookup_found_ds)
+{
+    // when
+    const TSLanguage *lang = ts_grammar_registry_lookup (data->grammar_name);
+
+    // then
+    ck_assert_msg (lang != NULL, "Grammar '%s' should be found in registry", data->grammar_name);
+}
+END_PARAMETRIZED_TEST
+
+/* --------------------------------------------------------------------------------------------- */
+/* Test 2: the grammar registry answers NULL for a name no grammar has */
+
+START_TEST (test_registry_lookup_not_found)
+{
+    // when
+    const TSLanguage *lang = ts_grammar_registry_lookup ("nonexistent_language_xyz");
+
+    // then
+    mctest_assert_null (lang);
+}
+END_TEST
+
+/* --------------------------------------------------------------------------------------------- */
+/* Test 3: every grammar that is available has a highlight query which compiles.
+   This is the most important test -- a single invalid node name in a .scm file makes
+   ts_query_new() reject the whole query, a failure that is otherwise silent.
+
+   The grammars to check are discovered by scanning the installed syntax-ts
+   directories for per-grammar subdirectories that contain highlights.scm. */
+
+/**
+ * Try to compile a query for a grammar. Returns: 0=success, 1=failure, -1=skip.
+ *
+ * A grammar is skipped when LANG is NULL or older than the oldest ABI the linked
+ * tree-sitter accepts, or when its query file cannot be found or read.
+ *
+ * NOTE(review): the query is looked up as "NAME-highlights.scm" through
+ * test_find_query_file(), while the caller discovers grammars through
+ * "DIR/NAME/highlights.scm".  If only the per-grammar layout is installed every
+ * grammar is skipped here -- confirm which layout is the intended one.
+ */
+static int
+test_one_query (const char *name, const TSLanguage *lang)
+{
+    char path[1024];
+    char filename[256];
+    char *src = NULL;
+    gsize len = 0;
+    uint32_t eo = 0;
+    TSQueryError et = TSQueryErrorNone;
+    TSQuery *q;
+
+    if (lang == NULL || ts_language_version (lang) < TREE_SITTER_MIN_COMPATIBLE_LANGUAGE_VERSION)
+        return -1;
+
+    snprintf (filename, sizeof (filename), "%s-highlights.scm", name);
+    if (!test_find_query_file (path, sizeof (path), filename))
+        return -1; /* skip, query file not available */
+
+    if (!g_file_get_contents (path, &src, &len, NULL))
+        return -1;
+
+    q = ts_query_new (lang, src, (uint32_t) len, &eo, &et);
+    g_free (src);
+
+    if (q == NULL)
+        return 1;
+
+    ts_query_delete (q);
+    return 0;
+}
+
+START_TEST (test_all_query_files_compile)
+{
+    const char *home;
+    gchar *user_dir = NULL;
+    const char *dirs[2];
+    int n_dirs = 0;
+    int d;
+    int tested = 0;
+    int failed = 0;
+    char first_fail[128] = "";
+
+    /* Scan per-grammar directories for highlights.scm: the user's installation
+       first (only if a home directory is known), then the system-wide one. */
+    home = g_get_home_dir ();
+    if (home != NULL)
+    {
+        user_dir = g_strdup_printf ("%s/.local/share/mc/syntax-ts", home);
+        dirs[n_dirs++] = user_dir;
+    }
+    dirs[n_dirs++] = "/usr/share/mc/syntax-ts";
+
+    /* The first directory that yields at least one tested grammar wins. */
+    for (d = 0; d < n_dirs && tested == 0; d++)
+    {
+        GDir *dir = g_dir_open (dirs[d], 0, NULL);
+        const gchar *entry;
+
+        if (dir == NULL)
+            continue;
+
+        while ((entry = g_dir_read_name (dir)) != NULL)
+        {
+            gchar *scm_path;
+
+            scm_path = g_build_filename (dirs[d], entry, "highlights.scm", NULL);
+            if (g_file_test (scm_path, G_FILE_TEST_IS_REGULAR))
+            {
+                const TSLanguage *lang = ts_grammar_registry_lookup (entry);
+                int rc = test_one_query (entry, lang);
+
+                /* rc < 0 means "skipped" and counts neither way */
+                if (rc >= 0)
+                {
+                    tested++;
+                    if (rc == 1 && ++failed == 1)
+                        snprintf (first_fail, sizeof (first_fail), "%s", entry);
+                }
+            }
+            g_free (scm_path);
+        }
+        g_dir_close (dir);
+    }
+
+    g_free (user_dir);
+
+    ck_assert_msg (tested > 0, "No grammars found");
+    ck_assert_msg (failed == 0, "Query failed for %d/%d (first: %s)", failed, tested, first_fail);
+}
+END_TEST
+
+/* 
--------------------------------------------------------------------------------------------- */ +/* Test 4: Verify parser creation and basic parse works for a few grammars */ + +#ifdef HAVE_GRAMMAR_C +START_TEST (test_parser_basic_parse) +{ + const char *test_code = "int main(void) { return 0; }\n"; + const TSLanguage *lang; + TSParser *parser; + TSTree *tree; + + lang = ts_grammar_registry_lookup ("c"); + ck_assert_msg (lang != NULL, "C grammar must exist"); + + parser = ts_parser_new (); + ck_assert_msg (parser != NULL, "ts_parser_new() must succeed"); + + ck_assert_msg (ts_parser_set_language (parser, lang), "ts_parser_set_language() must succeed"); + + tree = ts_parser_parse_string (parser, NULL, test_code, (uint32_t) strlen (test_code)); + ck_assert_msg (tree != NULL, "ts_parser_parse_string() must return a tree"); + + // Root node should not be error + TSNode root = ts_tree_root_node (tree); + ck_assert_msg (!ts_node_is_null (root), "Root node must not be null"); + ck_assert_msg (ts_node_child_count (root) > 0, "Root node must have children"); + + ts_tree_delete (tree); + ts_parser_delete (parser); +} +END_TEST +#endif + +/* --------------------------------------------------------------------------------------------- */ +/* Test 5: Query cursor produces captures for C code */ + +#ifdef HAVE_GRAMMAR_C +START_TEST (test_query_captures_c) +{ + const char *test_code = "// comment\nint main(void) { return 0; }\n"; + const TSLanguage *lang; + TSParser *parser; + TSTree *tree; + char query_path[1024]; + char *query_src = NULL; + gsize query_len = 0; + uint32_t error_offset = 0; + TSQueryError error_type = TSQueryErrorNone; + TSQuery *query; + TSQueryCursor *cursor; + TSQueryMatch match; + int capture_count = 0; + + lang = ts_grammar_registry_lookup ("c"); + ck_assert_msg (lang != NULL, "C grammar must exist"); + + // Parse test code + parser = ts_parser_new (); + ts_parser_set_language (parser, lang); + tree = ts_parser_parse_string (parser, NULL, test_code, 
/* --------------------------------------------------------------------------------------------- */
/* Test 6: Markdown inline injection - parse with included ranges and get captures */

#if defined(HAVE_GRAMMAR_MARKDOWN) && defined(HAVE_GRAMMAR_MARKDOWN_INLINE)
/*
 * Two-stage markdown parse:
 *   1. parse the whole document with the "markdown" (block) grammar;
 *   2. collect the range of every "inline" node of that tree and re-parse the same
 *      text with the "markdown_inline" grammar restricted to those ranges;
 *   3. run markdown_inline-highlights.scm over the inline tree and require at
 *      least two captures in total.
 */
START_TEST (test_markdown_inline_injection)
{
    const char *markdown_text = "# Hello\n\nThis is **bold** and `code` text.\n";
    const TSLanguage *block_lang;
    const TSLanguage *inline_lang;
    TSParser *block_parser;
    TSParser *inline_parser;
    TSTree *block_tree;
    TSTree *inline_tree;
    GArray *inline_ranges;
    GArray *pending;
    TSNode root;
    char query_path[1024];
    char *query_text = NULL;
    gsize query_size = 0;
    uint32_t err_offset = 0;
    TSQueryError err_type = TSQueryErrorNone;
    TSQuery *query;
    TSQueryCursor *cursor;
    TSQueryMatch match;
    int total_captures = 0;

    block_lang = ts_grammar_registry_lookup ("markdown");
    ck_assert_msg (block_lang != NULL, "markdown grammar must exist");

    inline_lang = ts_grammar_registry_lookup ("markdown_inline");
    ck_assert_msg (inline_lang != NULL, "markdown_inline grammar must exist");

    // Stage 1: block-level tree of the whole document
    block_parser = ts_parser_new ();
    ts_parser_set_language (block_parser, block_lang);
    block_tree = ts_parser_parse_string (block_parser, NULL, markdown_text,
                                         (uint32_t) strlen (markdown_text));
    ck_assert_msg (block_tree != NULL, "Block parse must succeed");

    // Depth-first walk with an explicit stack; an "inline" node contributes its range
    // and is not descended into, any other node has its children pushed.
    inline_ranges = g_array_new (FALSE, FALSE, sizeof (TSRange));
    pending = g_array_new (FALSE, FALSE, sizeof (TSNode));
    root = ts_tree_root_node (block_tree);
    g_array_append_val (pending, root);

    while (pending->len != 0)
    {
        const guint top = pending->len - 1;
        TSNode current = g_array_index (pending, TSNode, top);

        g_array_set_size (pending, top);

        if (strcmp (ts_node_type (current), "inline") != 0)
        {
            const uint32_t n_children = ts_node_child_count (current);

            for (uint32_t k = 0; k < n_children; k++)
            {
                TSNode kid = ts_node_child (current, k);

                g_array_append_val (pending, kid);
            }
        }
        else
        {
            TSRange span;

            span.start_point = ts_node_start_point (current);
            span.end_point = ts_node_end_point (current);
            span.start_byte = ts_node_start_byte (current);
            span.end_byte = ts_node_end_byte (current);
            g_array_append_val (inline_ranges, span);
        }
    }

    g_array_free (pending, TRUE);

    ck_assert_msg (inline_ranges->len > 0,
                   "Should find at least one 'inline' node in markdown block tree");

    // Stage 2: inline grammar, limited to the collected ranges
    inline_parser = ts_parser_new ();
    ts_parser_set_language (inline_parser, inline_lang);
    ts_parser_set_included_ranges (inline_parser, &g_array_index (inline_ranges, TSRange, 0),
                                   inline_ranges->len);

    inline_tree = ts_parser_parse_string (inline_parser, NULL, markdown_text,
                                          (uint32_t) strlen (markdown_text));
    ck_assert_msg (inline_tree != NULL, "Inline parse must succeed");

    // Stage 3: locate, read and compile the inline highlight query
    ck_assert_msg (test_find_query_file (query_path, sizeof (query_path),
                                         "markdown_inline-highlights.scm"),
                   "markdown_inline-highlights.scm must be readable");
    ck_assert_msg (g_file_get_contents (query_path, &query_text, &query_size, NULL),
                   "markdown_inline-highlights.scm must be readable");

    query = ts_query_new (inline_lang, query_text, (uint32_t) query_size, &err_offset, &err_type);
    ck_assert_msg (query != NULL,
                   "markdown_inline query must compile (error type %d at offset %u)",
                   (int) err_type, err_offset);

    cursor = ts_query_cursor_new ();
    ts_query_cursor_exec (cursor, query, ts_tree_root_node (inline_tree));

    // Every capture of every match counts once
    while (ts_query_cursor_next_match (cursor, &match))
        total_captures += match.capture_count;

    // We expect at least 2 captures: **bold** (strong_emphasis) and `code` (code_span)
    ck_assert_msg (total_captures >= 2,
                   "Expected at least 2 captures for markdown inline test, got %d",
                   total_captures);

    ts_query_cursor_delete (cursor);
    ts_query_delete (query);
    g_free (query_text);
    g_array_free (inline_ranges, TRUE);
    ts_tree_delete (inline_tree);
    ts_tree_delete (block_tree);
    ts_parser_delete (inline_parser);
    ts_parser_delete (block_parser);
}
END_TEST
#endif
ts_parser_parse_string (html_parser, NULL, test_html, (uint32_t) strlen (test_html)); + ck_assert_msg (html_tree != NULL, "HTML parse must succeed"); + + // Verify that script_element and style_element nodes exist in the tree + { + gboolean found_script = FALSE; + gboolean found_style = FALSE; + GArray *stack = g_array_new (FALSE, FALSE, sizeof (TSNode)); + + root = ts_tree_root_node (html_tree); + g_array_append_val (stack, root); + + while (stack->len > 0) + { + TSNode node = g_array_index (stack, TSNode, stack->len - 1); + const char *type; + + g_array_set_size (stack, stack->len - 1); + type = ts_node_type (node); + + if (strcmp (type, "script_element") == 0) + found_script = TRUE; + if (strcmp (type, "style_element") == 0) + found_style = TRUE; + + child_count = ts_node_child_count (node); + for (i = 0; i < child_count; i++) + { + TSNode child = ts_node_child (node, i); + g_array_append_val (stack, child); + } + } + + g_array_free (stack, TRUE); + + ck_assert_msg (found_script, "HTML tree must contain script_element"); + ck_assert_msg (found_style, "HTML tree must contain style_element"); + } + + // Parse the raw_text inside script_element with the JavaScript grammar + { + const TSLanguage *js_lang; + TSParser *js_parser; + char query_path[1024]; + char *query_src = NULL; + gsize query_len = 0; + uint32_t error_offset = 0; + TSQueryError error_type = TSQueryErrorNone; + TSQuery *query; + + js_lang = ts_grammar_registry_lookup ("javascript"); + ck_assert_msg (js_lang != NULL, "javascript grammar must exist"); + + // Find raw_text inside script_element + { + GArray *stack = g_array_new (FALSE, FALSE, sizeof (TSNode)); + + root = ts_tree_root_node (html_tree); + g_array_append_val (stack, root); + + while (stack->len > 0) + { + TSNode node = g_array_index (stack, TSNode, stack->len - 1); + const char *type; + + g_array_set_size (stack, stack->len - 1); + type = ts_node_type (node); + + if (strcmp (type, "script_element") == 0) + { + // Find raw_text child and 
parse it with JS + uint32_t cc = ts_node_child_count (node); + uint32_t ci; + + for (ci = 0; ci < cc; ci++) + { + TSNode child = ts_node_child (node, ci); + + if (strcmp (ts_node_type (child), "raw_text") == 0) + { + TSRange r; + TSTree *js_tree; + TSQueryCursor *cursor; + TSQueryMatch match; + int js_captures = 0; + + r.start_point = ts_node_start_point (child); + r.end_point = ts_node_end_point (child); + r.start_byte = ts_node_start_byte (child); + r.end_byte = ts_node_end_byte (child); + + js_parser = ts_parser_new (); + ts_parser_set_language (js_parser, js_lang); + ts_parser_set_included_ranges (js_parser, &r, 1); + + js_tree = ts_parser_parse_string (js_parser, NULL, test_html, + (uint32_t) strlen (test_html)); + ck_assert_msg (js_tree != NULL, "JS injection parse must succeed"); + + ck_assert_msg ( + test_find_query_file (query_path, sizeof (query_path), + "javascript-highlights.scm"), + "javascript-highlights.scm must be readable"); + ck_assert_msg ( + g_file_get_contents (query_path, &query_src, &query_len, NULL), + "javascript-highlights.scm must be readable"); + + query = ts_query_new (js_lang, query_src, (uint32_t) query_len, + &error_offset, &error_type); + ck_assert_msg (query != NULL, "JS query must compile"); + + cursor = ts_query_cursor_new (); + ts_query_cursor_exec (cursor, query, ts_tree_root_node (js_tree)); + + while (ts_query_cursor_next_match (cursor, &match)) + { + uint16_t mi; + + for (mi = 0; mi < match.capture_count; mi++) + js_captures++; + } + + ck_assert_msg (js_captures >= 1, + "Expected JS captures from script content, got %d", + js_captures); + + ts_query_cursor_delete (cursor); + ts_query_delete (query); + g_free (query_src); + ts_tree_delete (js_tree); + ts_parser_delete (js_parser); + } + } + break; + } + + child_count = ts_node_child_count (node); + for (i = 0; i < child_count; i++) + { + TSNode child = ts_node_child (node, i); + g_array_append_val (stack, child); + } + } + + g_array_free (stack, TRUE); + } + } + + 
ts_tree_delete (html_tree); + ts_parser_delete (html_parser); +} +END_TEST +#endif + +/* --------------------------------------------------------------------------------------------- */ + +int +main (void) +{ + TCase *tc_core; + + tc_core = tcase_create ("Core"); + + tcase_add_checked_fixture (tc_core, setup, teardown); + + // Add new tests here: *************** + if (sizeof (test_registry_lookup_found_ds) > 0) + mctest_add_parameterized_test (tc_core, test_registry_lookup_found, + test_registry_lookup_found_ds); + tcase_add_test (tc_core, test_registry_lookup_not_found); + tcase_add_test (tc_core, test_all_query_files_compile); +#ifdef HAVE_GRAMMAR_C + tcase_add_test (tc_core, test_parser_basic_parse); + tcase_add_test (tc_core, test_query_captures_c); +#endif +#if defined(HAVE_GRAMMAR_MARKDOWN) && defined(HAVE_GRAMMAR_MARKDOWN_INLINE) + tcase_add_test (tc_core, test_markdown_inline_injection); +#endif +#if defined(HAVE_GRAMMAR_HTML) && defined(HAVE_GRAMMAR_JAVASCRIPT) && defined(HAVE_GRAMMAR_CSS) + tcase_add_test (tc_core, test_html_multi_injection); +#endif + // *********************************** + + return mctest_run_all (tc_core); +} + +/* --------------------------------------------------------------------------------------------- */ diff --git a/tests/syntax/.gitignore b/tests/syntax/.gitignore new file mode 100644 index 0000000000..7c733ffe12 --- /dev/null +++ b/tests/syntax/.gitignore @@ -0,0 +1,2 @@ +mc-syntax-dump +.libs diff --git a/tests/syntax/README.md b/tests/syntax/README.md new file mode 100644 index 0000000000..1d8d42ba8c --- /dev/null +++ b/tests/syntax/README.md @@ -0,0 +1,61 @@ +# mc-syntax-dump + +Dump MC syntax highlighting as ANSI-colored text. + +Uses MC's actual syntax engine internals to produce the exact same +colors that mcedit would show for a given file. 
+ +## Building + +From the MC build directory (after running `./configure` and `make`): + +```bash +make -C tests/syntax +``` + +This builds the `mc-syntax-dump` binary using the Makefile in +`tests/syntax/`. The tool links against MC's `libinternal.la` and +`libmc.la` to use the real `edit_get_syntax_color()` path. + +## Usage + +```text +tests/syntax/mc-syntax-dump [--ts|--legacy] <source-file> +``` + +- `--ts` — force tree-sitter highlighting +- `--legacy` — force legacy regex-based highlighting + +If neither flag is given, tree-sitter is tried first and falls back +to legacy. The output is the source file with ANSI color escape +sequences on stdout. Diagnostic messages go to stderr. + +## Examples + +Compare legacy and tree-sitter highlighting for a bash script: + +```bash +tests/syntax/mc-syntax-dump --legacy tests/syntax/samples/bash.sh \ + > /tmp/legacy.txt +tests/syntax/mc-syntax-dump --ts tests/syntax/samples/bash.sh \ + > /tmp/ts.txt + +cat /tmp/legacy.txt # view legacy colors +cat /tmp/ts.txt # view tree-sitter colors +``` + +## Prerequisites + +- MC must be compiled with tree-sitter support (`--with-tree-sitter`) +- Grammar `.so` files must be installed in + `~/.local/lib/mc/ts-grammars/` +- Syntax files must be present in `~/.local/share/mc/syntax/` (copy + from `/usr/share/mc/syntax/` if needed) +- Query overrides and `colors.ini` in `~/.local/share/mc/syntax-ts/` + +## Sample files + +The `samples/` subdirectory contains sample source files that exercise +all syntax features of each language. These are used to compare legacy +and tree-sitter highlighting output. Each sample has a corresponding +`*-report.md` file documenting the comparison results. diff --git a/tests/syntax/mc-syntax-dump.c b/tests/syntax/mc-syntax-dump.c new file mode 100644 index 0000000000..9aa9237326 --- /dev/null +++ b/tests/syntax/mc-syntax-dump.c @@ -0,0 +1,665 @@ +/* + * mc-syntax-dump.c - Dump MC syntax highlighting as ANSI-colored text.
/*
 * Best-effort write of LEN bytes of TEXT to stderr using write(2) only.
 * Gives up silently on the first error: nothing sensible can be done while crashing.
 */
static void
crash_write (const char *text, size_t len)
{
    while (len > 0)
    {
        const ssize_t done = write (STDERR_FILENO, text, len);

        if (done <= 0)
            return;

        text += done;
        len -= (size_t) done;
    }
}

/*
 * Handler for fatal signals: prints "Signal <n>, backtrace:" followed by the raw
 * backtrace to stderr, then terminates the process with exit status 1.
 *
 * The message is emitted with write(2) rather than stdio: stdio functions are not
 * async-signal-safe, and the interrupted code may be holding the stderr lock or be in
 * the middle of a heap operation. The signal number is therefore formatted by hand.
 *
 * NOTE(review): glibc documents that the first backtrace() call may load libgcc
 * dynamically; calling backtrace() once during startup would avoid doing that inside
 * the handler - confirm whether that is wanted here.
 *
 * @param sig  number of the signal being handled
 */
static void
crash_handler (int sig)
{
    static const char head[] = "Signal ";
    static const char tail[] = ", backtrace:\n";
    void *bt[30];
    char digits[sizeof (int) * 3 + 2];  /* enough for any int plus a sign */
    size_t pos = sizeof (digits);
    unsigned int magnitude = (sig < 0) ? 0U - (unsigned int) sig : (unsigned int) sig;
    int n;

    /* Decimal conversion, filled from the end of the buffer backwards */
    do
    {
        digits[--pos] = (char) ('0' + (int) (magnitude % 10U));
        magnitude /= 10U;
    }
    while (magnitude != 0U);

    if (sig < 0)
        digits[--pos] = '-';

    crash_write (head, sizeof (head) - 1);
    crash_write (digits + pos, sizeof (digits) - pos);
    crash_write (tail, sizeof (tail) - 1);

    n = backtrace (bt, 30);
    backtrace_symbols_fd (bt, n, STDERR_FILENO);

    /* _exit, not exit: skip atexit handlers and stdio flushing in a crashed process */
    _exit (1);
}
*/ +void __wrap_tty_color_init_lib (gboolean disable, gboolean force) +{ + (void) disable; + (void) force; +} + +void __wrap_tty_color_try_alloc_lib_pair (tty_color_lib_pair_t *mc_color_pair) +{ + (void) mc_color_pair; +} + +void __wrap_tty_color_deinit_lib (void) +{ +} + +#ifdef HAVE_TREE_SITTER +/* Duplicated from syntax_ts.c (file-scope types not visible to us) */ +typedef struct +{ + uint32_t start_byte; + uint32_t end_byte; + int color; +} ts_highlight_entry_t; + +/* + * TSInput read callback: reads chunks of text from the edit buffer. + * Duplicated from syntax_ts.c because the original is static. + */ +static const char * +ts_input_read_cb (void *payload, uint32_t byte_index, TSPoint position, uint32_t *bytes_read) +{ + static char buf[4096]; + WEdit *edit_buf = (WEdit *) payload; + uint32_t i; + + (void) position; + + for (i = 0; i < sizeof (buf) && (off_t) (byte_index + i) < edit_buf->buffer.size; i++) + buf[i] = edit_buffer_get_byte (&edit_buf->buffer, (off_t) (byte_index + i)); + + *bytes_read = i; + return (i > 0) ? 
buf : NULL; +} +#endif + +/* Wrap ts_init_for_file so we can disable it for --legacy mode */ +static gboolean ts_init_disabled = FALSE; +extern gboolean __real_ts_init_for_file (WEdit *edit, const char *forced_grammar); + +gboolean __wrap_ts_init_for_file (WEdit *edit, const char *forced_grammar) +{ + if (ts_init_disabled) + return FALSE; + return __real_ts_init_for_file (edit, forced_grammar); +} + +/* ------------------------------------------------------------------ */ +/* Color pair reverse mapping */ +/* ------------------------------------------------------------------ */ + +static const char *ansi_codes[16] = { + "\033[30m", "\033[31m", "\033[32m", "\033[33m", + "\033[34m", "\033[35m", "\033[36m", "\033[37m", + "\033[90m", "\033[91m", "\033[92m", "\033[93m", + "\033[94m", "\033[95m", "\033[96m", "\033[97m", +}; + +static const char *color_names[16] = { + "black", "red", "green", "brown", + "blue", "magenta", "cyan", "lightgray", + "gray", "brightred", "brightgreen", "yellow", + "brightblue", "brightmagenta", "brightcyan", "white", +}; + +#define CACHE_SIZE 4096 +static struct { int pair_index; int fg_index; } pair_cache[CACHE_SIZE]; +static int pair_cache_count = 0; + +static const char * +pair_index_to_fg_ansi (int pair_idx) +{ + int i, fg, bg; + + if (pair_idx < 0) + return ""; + + for (i = 0; i < pair_cache_count; i++) + if (pair_cache[i].pair_index == pair_idx) + return ansi_codes[pair_cache[i].fg_index]; + + for (fg = 0; fg < 16; fg++) + { + for (bg = 0; bg < 17; bg++) + { + tty_color_pair_t cp; + int test_idx; + + cp.fg = (char *) color_names[fg]; + cp.bg = (bg < 16) ? 
(char *) color_names[bg] : NULL; + cp.attrs = NULL; + cp.pair_index = 0; + + test_idx = tty_try_alloc_color_pair (&cp, FALSE); + if (test_idx == pair_idx) + { + if (pair_cache_count < CACHE_SIZE) + { + pair_cache[pair_cache_count].pair_index = pair_idx; + pair_cache[pair_cache_count].fg_index = fg; + pair_cache_count++; + } + return ansi_codes[fg]; + } + } + } + + return ""; +} + +/* ------------------------------------------------------------------ */ +/* Direct grammar loading (--grammar-dir / --lib-dir) */ +/* ------------------------------------------------------------------ */ + +#ifdef HAVE_TREE_SITTER + +/* + * Convert config.ini [colors] section into a temporary colors.ini file + * in the MC format (section = grammar name, keys = capture names). + * This lets MC's existing ts_load_grammar_registry() pick up the colors + * without modifying the static ts_color_map in syntax_ts.c. + * + * The temporary file is written into a temp directory, and MC's + * data path is pointed there so ts_load_grammar_registry() finds it. 
+ */ +static char * +create_temp_colors_ini (const char *config_path, const char *grammar_name) +{ + GKeyFile *kf; + gchar **keys; + gsize k_count, ki; + char *tmpdir; + char *ts_dir; + char *colors_path; + FILE *f; + + kf = g_key_file_new (); + if (!g_key_file_load_from_file (kf, config_path, G_KEY_FILE_NONE, NULL)) + { + g_key_file_free (kf); + return NULL; + } + + keys = g_key_file_get_keys (kf, "colors", &k_count, NULL); + if (keys == NULL) + { + g_key_file_free (kf); + return NULL; + } + + tmpdir = g_dir_make_tmp ("mc-syntax-dump-XXXXXX", NULL); + if (tmpdir == NULL) + { + g_strfreev (keys); + g_key_file_free (kf); + return NULL; + } + + ts_dir = g_build_filename (tmpdir, "syntax-ts", (char *) NULL); + g_mkdir_with_parents (ts_dir, 0700); + + colors_path = g_build_filename (ts_dir, "colors.ini", (char *) NULL); + f = fopen (colors_path, "w"); + g_free (colors_path); + g_free (ts_dir); + + if (f == NULL) + { + g_strfreev (keys); + g_key_file_free (kf); + g_free (tmpdir); + return NULL; + } + + fprintf (f, "[%s]\n", grammar_name); + for (ki = 0; ki < k_count; ki++) + { + gchar *value = g_key_file_get_value (kf, "colors", keys[ki], NULL); + if (value != NULL) + { + fprintf (f, "%s = %s\n", keys[ki], value); + g_free (value); + } + } + + fclose (f); + g_strfreev (keys); + g_key_file_free (kf); + + return tmpdir; +} + +/* + * Read the symbol= field from config.ini [grammar] section. + * Returns the symbol suffix or NULL if not specified. + * Caller must free. 
+ */ +static char * +read_symbol_from_config (const char *config_path) +{ + GKeyFile *kf; + char *value; + + kf = g_key_file_new (); + if (!g_key_file_load_from_file (kf, config_path, G_KEY_FILE_NONE, NULL)) + { + g_key_file_free (kf); + return NULL; + } + + value = g_key_file_get_value (kf, "grammar", "symbol", NULL); + g_key_file_free (kf); + + if (value != NULL) + g_strstrip (value); + + return value; +} + +/* + * Initialize tree-sitter for an edit buffer by loading the grammar, + * query, and colors directly from the specified directories. + * Returns TRUE on success. + */ +static gboolean +ts_init_direct (WEdit *edit, const char *grammar_dir, const char *lib_dir) +{ + char *dir_basename; + char *grammar_name; + char *config_path; + char *highlights_path; + char *so_path; + char *symbol_override; + char *symbol_name; + char *query_src; + gsize query_len; + GModule *module; + gpointer symbol; + const TSLanguage *(*lang_func) (void); + const TSLanguage *lang; + TSParser *parser; + TSTree *tree; + TSInput input; + uint32_t error_offset; + TSQueryError error_type; + TSQuery *query; + + /* Grammar name from directory basename */ + dir_basename = g_path_get_basename (grammar_dir); + grammar_name = g_strdup (dir_basename); + g_free (dir_basename); + + config_path = g_build_filename (grammar_dir, "config.ini", (char *) NULL); + highlights_path = g_build_filename (grammar_dir, "highlights.scm", (char *) NULL); + + /* Load colors from our temp colors.ini */ + { + extern void ts_load_grammar_registry (void); + ts_load_grammar_registry (); + } + + /* Determine symbol name */ + symbol_override = read_symbol_from_config (config_path); + if (symbol_override != NULL) + { + symbol_name = g_strdup_printf ("tree_sitter_%s", symbol_override); + g_free (symbol_override); + } + else + symbol_name = g_strdup_printf ("tree_sitter_%s", grammar_name); + + /* Load .so from lib_dir */ + so_path = g_strdup_printf ("%s/%s", lib_dir, grammar_name); + module = g_module_open (so_path, 
G_MODULE_BIND_LAZY); + g_free (so_path); + + if (module == NULL) + { + fprintf (stderr, "Failed to load grammar module: %s\n", + g_module_error ()); + g_free (config_path); + g_free (highlights_path); + g_free (symbol_name); + g_free (grammar_name); + return FALSE; + } + + if (!g_module_symbol (module, symbol_name, &symbol)) + { + fprintf (stderr, "Symbol '%s' not found in module\n", symbol_name); + g_module_close (module); + g_free (config_path); + g_free (highlights_path); + g_free (symbol_name); + g_free (grammar_name); + return FALSE; + } + g_free (symbol_name); + + lang_func = (const TSLanguage * (*) (void)) symbol; + lang = lang_func (); + + /* Create parser */ + parser = ts_parser_new (); + if (!ts_parser_set_language (parser, lang)) + { + fprintf (stderr, "Failed to set parser language\n"); + ts_parser_delete (parser); + g_module_close (module); + g_free (config_path); + g_free (highlights_path); + g_free (grammar_name); + return FALSE; + } + + /* Parse the buffer */ + input.payload = edit; + input.read = ts_input_read_cb; + input.encoding = TSInputEncodingUTF8; + + tree = ts_parser_parse (parser, NULL, input); + if (tree == NULL) + { + fprintf (stderr, "Failed to parse file\n"); + ts_parser_delete (parser); + g_module_close (module); + g_free (config_path); + g_free (highlights_path); + g_free (grammar_name); + return FALSE; + } + + /* Load and compile highlight query */ + if (!g_file_get_contents (highlights_path, &query_src, &query_len, NULL)) + { + fprintf (stderr, "Failed to read %s\n", highlights_path); + ts_tree_delete (tree); + ts_parser_delete (parser); + g_module_close (module); + g_free (config_path); + g_free (highlights_path); + g_free (grammar_name); + return FALSE; + } + + query = ts_query_new (lang, query_src, (uint32_t) query_len, &error_offset, &error_type); + g_free (query_src); + + if (query == NULL) + { + fprintf (stderr, "Query compilation failed at offset %u (error type %d)\n", + error_offset, error_type); + ts_tree_delete (tree); + 
ts_parser_delete (parser); + g_module_close (module); + g_free (config_path); + g_free (highlights_path); + g_free (grammar_name); + return FALSE; + } + + /* Store in edit widget */ + edit->ts.parser = parser; + edit->ts.tree = tree; + edit->ts.highlight_query = query; + edit->ts.highlights = g_array_new (FALSE, FALSE, sizeof (ts_highlight_entry_t)); + edit->ts.highlights_start = -1; + edit->ts.highlights_end = -1; + edit->ts.grammar_name = grammar_name; + edit->ts.active = TRUE; + edit->ts.need_reparse = FALSE; + + g_free (config_path); + g_free (highlights_path); + + return TRUE; +} + +#endif /* HAVE_TREE_SITTER */ + +/* ------------------------------------------------------------------ */ +/* Main */ +/* ------------------------------------------------------------------ */ + +int +main (int argc, char *argv[]) +{ + const char *source_path = NULL; + const char *grammar_dir = NULL; + const char *lib_dir = NULL; + gboolean force_legacy = FALSE; + gboolean force_ts = FALSE; + int i; + WEdit *edit; + WRect rect; + edit_arg_t arg; + off_t byte_idx; + const char *reset = "\033[0m"; + static WGroup owner; + + signal (SIGSEGV, crash_handler); + signal (SIGABRT, crash_handler); + + setlocale (LC_ALL, ""); + + for (i = 1; i < argc; i++) + { + if (strcmp (argv[i], "--ts") == 0) + force_ts = TRUE; + else if (strcmp (argv[i], "--legacy") == 0) + force_legacy = TRUE; + else if (strcmp (argv[i], "--grammar-dir") == 0 && i + 1 < argc) + grammar_dir = argv[++i]; + else if (strcmp (argv[i], "--lib-dir") == 0 && i + 1 < argc) + lib_dir = argv[++i]; + else + source_path = argv[i]; + } + + if (source_path == NULL) + { + fprintf (stderr, + "Usage: %s [--ts|--legacy] [--grammar-dir DIR --lib-dir DIR] \n", + argv[0]); + return 1; + } + + /* --grammar-dir requires --lib-dir and implies --ts */ + if (grammar_dir != NULL) + { + if (lib_dir == NULL) + { + fprintf (stderr, "--grammar-dir requires --lib-dir\n"); + return 1; + } + force_ts = TRUE; + force_legacy = FALSE; + } + + 
str_init_strings (NULL); + mc_config_init_config_paths (NULL); + + vfs_init (); + vfs_init_localfs (); + vfs_setup_work_dir (); + + mc_global.share_data_dir = g_strdup ("/usr/share/mc"); + mc_global.sysconfig_dir = g_strdup ("/usr/share/mc"); + + tty_init_colors (FALSE, TRUE, 256); + use_colors = TRUE; + + edit_options.syntax_highlighting = TRUE; + edit_options.filesize_threshold = (char *) "64M"; + + rect_init (&rect, 0, 0, 24, 80); + memset (&arg, 0, sizeof (arg)); + arg.file_vpath = vfs_path_from_str (source_path); + arg.line_number = 0; + +#ifdef HAVE_TREE_SITTER + /* When using --grammar-dir, disable normal TS init during edit_init + so we can do our own direct init afterwards. */ + if (grammar_dir != NULL) + { + char *config_path; + char *dir_basename; + char *grammar_name; + char *tmpdir; + + ts_init_disabled = TRUE; + + /* Set up color config from config.ini BEFORE edit_init, + because ts_load_grammar_registry() runs once and caches. */ + dir_basename = g_path_get_basename (grammar_dir); + grammar_name = g_strdup (dir_basename); + g_free (dir_basename); + + config_path = g_build_filename (grammar_dir, "config.ini", (char *) NULL); + tmpdir = create_temp_colors_ini (config_path, grammar_name); + g_free (config_path); + g_free (grammar_name); + + if (tmpdir != NULL) + { + g_free (mc_global.share_data_dir); + mc_global.share_data_dir = tmpdir; + /* Also redirect user data path so the existing + ~/.local/share/mc/syntax-ts/colors.ini doesn't + override our temp colors. 
*/ + setenv ("MC_XDG_DATA_HOME", tmpdir, 1); + /* Re-initialize paths with the new env */ + mc_config_init_config_paths (NULL); + } + } +#endif + + edit = edit_init (NULL, &rect, &arg); + if (edit == NULL) + { + fprintf (stderr, "Failed to open file: %s\n", source_path); + vfs_path_free (arg.file_vpath, TRUE); + return 1; + } + + memset (&owner, 0, sizeof (owner)); + group_add_widget (&owner, WIDGET (edit)); + +#ifdef HAVE_TREE_SITTER + if (grammar_dir != NULL) + { + /* Direct grammar loading from specified directories */ + ts_init_disabled = FALSE; + if (!ts_init_direct (edit, grammar_dir, lib_dir)) + { + fprintf (stderr, "Failed to initialize tree-sitter from %s\n", grammar_dir); + /* Fall through to show file without highlighting */ + } + } + else if (force_legacy && edit->ts.active) + { + ts_free (edit); + edit->ts.active = FALSE; + ts_init_disabled = TRUE; + edit_load_syntax (edit, NULL, NULL); + ts_init_disabled = FALSE; + } + else if (force_ts && !edit->ts.active) + fprintf (stderr, "Tree-sitter failed to initialize for this file\n"); +#else + (void) force_ts; + (void) force_legacy; + (void) grammar_dir; + (void) lib_dir; +#endif + + fprintf (stderr, "File: %s (%ld bytes)\n", source_path, (long) edit->buffer.size); + fprintf (stderr, "Rules: %p\n", (void *) edit->rules); + fprintf (stderr, "Mode: %s\n", +#ifdef HAVE_TREE_SITTER + edit->ts.active ? "tree-sitter" : "legacy" +#else + "legacy" +#endif + ); + + { + int prev_color = -1; + + for (byte_idx = 0; byte_idx < edit->buffer.size; byte_idx++) + { + int color; + char ch; + const char *ansi; + + color = edit_get_syntax_color (edit, byte_idx); + ch = (char) edit_buffer_get_byte (&edit->buffer, byte_idx); + + if (byte_idx < 12) + fprintf (stderr, " [%ld] '%c' color=%d\n", + (long) byte_idx, (ch >= 32 && ch < 127) ? 
ch : '.', color); + + if (color != prev_color) + { + if (prev_color >= 0 && prev_color < TTY_COLOR_MAP_OFFSET) + fputs (reset, stdout); + + if (color >= 0 && color < TTY_COLOR_MAP_OFFSET) + { + ansi = pair_index_to_fg_ansi (color); + if (ansi[0] != '\0') + fputs (ansi, stdout); + } + prev_color = color; + } + + putchar (ch); + } + + if (prev_color >= 0 && prev_color < TTY_COLOR_MAP_OFFSET) + fputs (reset, stdout); + } + + edit_clean (edit); + g_free (edit); + vfs_path_free (arg.file_vpath, TRUE); + vfs_shut (); + + return 0; +}