Mountain/Environment/SearchProvider.rs
1//! # SearchProvider (Environment)
2//!
3//! Implements the `SearchProvider` trait using the `grep-searcher` crate
4//! (the ripgrep library) for `MountainEnvironment`.
5//!
6//! ## Search architecture
7//!
8//! The search implementation uses a multi-threaded approach:
9//!
10//! 1. **Pattern compilation** - regex is compiled with case/word/multiline
11//! modifiers; plain-text queries are `regex::escape`d first.
12//! 2. **Parallel walking** - workspace files are walked via
13//! `WalkBuilder::build_parallel()`, respecting `.gitignore` and `.ignore`
14//! files automatically.
15//! 3. **Per-file search** - each file is searched individually using a `Sink`
16//! pattern (`PerFileSink`).
17//! 4. **Result aggregation** - matches are collected in a shared
18//! `Arc<Mutex<Vec<FileMatch>>>`.
19//!
20//! ## Search features
21//!
22//! - **Case sensitivity** - controlled by `isCaseSensitive` option
23//! - **Word matching** - controlled by `isWordMatch` option
24//! - **Regex support** - full regex via `grep-regex`
25//! - **Ignore files** - respects `.gitignore`, `.ignore`, and siblings
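//! - **Memory efficient** - files are handed to `grep-searcher` by path
//!   rather than being read into strings here; matches are collected in
//!   memory and returned as a single batch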
27//!
28//! ## Search result format
29//!
30//! Each match includes:
31//! - `resource` - file URI
32//! - `lineNumber` - 1-based line number
33//! - `preview` - matched text line (capped at 512 bytes)
//! - `columns` - per-match `{start, end}` column ranges into `preview`
//!   (0-based; converted from the matcher's byte offsets for VS Code's
//!   `ISearchRange`)
36//!
37//! ## VS Code reference
38//!
39//! - `vs/workbench/contrib/search/browser/searchWidget.ts`
40//! - `vs/platform/search/common/search.ts`
41//! - `vs/platform/search/common/fileSearch.ts`
42
43use std::{
44 io,
45 path::PathBuf,
46 sync::{Arc, Mutex},
47};
48
49use CommonLibrary::{Error::CommonError::CommonError, Search::SearchProvider::SearchProvider};
50use async_trait::async_trait;
51use grep_matcher::Matcher;
52use grep_regex::{RegexMatcher, RegexMatcherBuilder};
53use grep_searcher::{Searcher, SearcherBuilder, Sink, SinkMatch};
54use ignore::WalkBuilder;
55use serde::{Deserialize, Serialize};
56use serde_json::{Value, json};
57
58use super::{MountainEnvironment::MountainEnvironment, Utility};
59use crate::dev_log;
60
61// TODO: result pagination, cancellation via CancellationToken, include/exclude
62// patterns, context lines (before/after), file-type filtering, replacement
63// highlighting, progress reporting, multi-folder independent search, caching,
64// regex capture groups, search history, result export, performance metrics,
65// deduplication, glob file matching, result ranking, binary file handling,
66// symlink following, max file size limit, search timeout, hidden files,
67// multi-line regex.
68
/// Mirrors VS Code's `ITextSearchQuery` shape (`vs/workbench/services/
/// search/common/search.ts`). The workbench's Search view serialises
/// the user's input into this struct and the ProxyChannel sends it as
/// slot 0 of the `search:textSearch` call.
///
/// Fields are deserialised from camelCase keys. Every flag is optional;
/// a missing or `null` flag is treated as `false` by `TextSearch`.
///
/// - `pattern`: the user's typed query
/// - `isRegExp` (default `false`): when `false`, the pattern is
///   `regex::escape`'d before compilation so a literal search for `obj.method(`
///   doesn't blow up the regex parser.
/// - `isCaseSensitive` (default `false`): when `false`, the matcher is built
///   with `case_insensitive(true)`.
/// - `isWordMatch` (default `false`): forwarded to
///   `RegexMatcherBuilder::word`, which requires matches to fall on word
///   boundaries (similar to, but not identical with, wrapping the pattern in
///   `\b…\b`).
/// - `isMultiline` (default `false`): forwarded to
///   `RegexMatcherBuilder::multi_line`, i.e. the regex `m` flag that lets
///   `^`/`$` match at line boundaries. It does NOT make `.` match `\n`, and
///   the `Searcher` built in `TextSearch` does not enable multi-line search.
#[derive(Deserialize, Debug, Default)]
#[serde(rename_all = "camelCase")]
struct TextSearchQuery {
	pattern:String,

	#[serde(default)]
	is_case_sensitive:Option<bool>,

	#[serde(default)]
	is_word_match:Option<bool>,

	#[serde(default)]
	is_reg_exp:Option<bool>,

	#[serde(default)]
	is_multiline:Option<bool>,
}
100
/// Per-match column range within the preview line.
///
/// `start` and `end` are 0-based offsets into the preview text, NOT byte
/// offsets into the raw line. `PerFileSink::matched` converts the
/// matcher's byte offsets before building this struct so the workbench
/// does not mis-highlight lines containing multi-byte UTF-8 characters
/// (the search panel underlines the wrong substring otherwise).
///
/// NOTE(review): VS Code measures columns in UTF-16 code units; confirm
/// the unit produced by `PerFileSink::matched` agrees with that for
/// characters outside the Basic Multilingual Plane.
///
/// VS Code's `ISearchRange` is 1-based for line numbers but 0-based
/// for columns; the SkyBridge consumer adds the +1 line offset there.
#[derive(Serialize, Clone, Debug)]
#[serde(rename_all = "camelCase")]
struct ColumnRange {
	/// Inclusive start column of the match.
	start:u64,

	/// Exclusive end column of the match.
	end:u64,
}
118
/// One matching line inside a file, serialised with camelCase keys
/// (`preview`, `lineNumber`, `columns`).
#[derive(Serialize, Clone, Debug)]
#[serde(rename_all = "camelCase")]
struct TextMatch {
	/// Text of the matched line with its trailing line break removed,
	/// truncated by `PerFileSink::matched` to its preview byte cap.
	preview:String,

	/// 1-based line number (grep-searcher emits 1-based when
	/// `line_number(true)` is configured on the SearcherBuilder).
	line_number:u64,

	/// Per-line ranges where the matcher actually matched. A single
	/// line can contain multiple matches (e.g. `test test test`); each
	/// gets its own range. Empty when match-position lookup failed -
	/// in that case the renderer falls back to highlighting the whole
	/// line.
	columns:Vec<ColumnRange>,
}
135
/// All matches found in a single file. The search result returned by
/// `TextSearch` is a JSON array of these.
#[derive(Serialize, Clone, Debug)]
#[serde(rename_all = "camelCase")]
struct FileMatch {
	// File URI (built with `url::Url::from_file_path`); also the key used to
	// group matches that belong to the same file.
	resource:String,

	// Matching lines, appended in the order the sink reports them.
	matches:Vec<TextMatch>,
}
144
// This Sink is designed to be created for each file. It holds a reference to
// the central results vector and the path of the file it's searching.
struct PerFileSink {
	// Path of the file being searched; converted to a file URI per match.
	path:PathBuf,

	// Shared result accumulator; every sink created during one search
	// points at the same vector.
	results:Arc<Mutex<Vec<FileMatch>>>,

	/// Cloned for every file (one sink is built per walked entry) so the
	/// sink can re-run the matcher against the raw line bytes to recover
	/// column ranges. `SinkMatch::bytes()` gives us the matched line but
	/// not where in the line the matcher hit; calling
	/// `Matcher::find_at(...)` ourselves is the documented pattern for
	/// recovering that information.
	matcher:RegexMatcher,
}
159
160impl Sink for PerFileSink {
161 type Error = io::Error;
162
163 fn matched(&mut self, _Searcher:&Searcher, Mat:&SinkMatch<'_>) -> Result<bool, Self::Error> {
164 let RawLine = Mat.bytes();
165
166 // Trim trailing newline so the preview text the renderer shows
167 // doesn't carry a stray empty line break.
168 let TrimmedLen = if RawLine.ends_with(b"\r\n") {
169 RawLine.len().saturating_sub(2)
170 } else if RawLine.last() == Some(&b'\n') {
171 RawLine.len().saturating_sub(1)
172 } else {
173 RawLine.len()
174 };
175
176 let LineBytes = &RawLine[..TrimmedLen];
177
178 // Cap preview length at 512 chars - super-long minified lines
179 // would otherwise force the renderer to layout massive rows
180 // AND make the byte→char map below grow proportionally.
181 const PREVIEW_BYTE_CAP:usize = 512;
182
183 let CapBytes = LineBytes.len().min(PREVIEW_BYTE_CAP);
184
185 // Round down to the nearest UTF-8 boundary so `from_utf8_lossy`
186 // doesn't replace half a multibyte char with U+FFFD.
187 let SafeCap = (0..=CapBytes)
188 .rev()
189 .find(|&I| I == 0 || I == LineBytes.len() || (LineBytes[I] & 0xC0) != 0x80)
190 .unwrap_or(0);
191
192 let Preview = String::from_utf8_lossy(&LineBytes[..SafeCap]).to_string();
193
194 // `line_number(true)` was set on the SearcherBuilder so this
195 // returns Some(n) (1-based). Default to 1 if we somehow lose
196 // it - rendering "line 0" looked wrong even when the rest of
197 // the data was correct.
198 let LineNumber = Mat.line_number().unwrap_or(1);
199
200 // Build a byte→char map ONCE per line so every column lookup
201 // is O(log n) (binary search) instead of O(n) (the previous
202 // `char_indices().position()` per call). On lines with many
203 // matches this collapses the per-line work from quadratic to
204 // linear, which is the difference between a 6 s search and a
205 // minutes-long hang on workspaces that contain match-dense
206 // minified bundles.
207 let mut CharBoundaries:Vec<usize> = Vec::with_capacity(Preview.len() / 2 + 1);
208
209 for (B, _) in Preview.char_indices() {
210 CharBoundaries.push(B);
211 }
212
213 CharBoundaries.push(Preview.len()); // Sentinel for end-of-string.
214
215 let ByteToChar = |Byte:usize| -> u64 {
216 match CharBoundaries.binary_search(&Byte) {
217 Ok(Index) => Index as u64,
218
219 Err(Index) => Index as u64,
220 }
221 };
222
223 // Walk the line bytes and collect every sub-line range the
224 // matcher hits. Multiple matches per line are common
225 // (e.g. searching for `test` in `test test`); each becomes its
226 // own ColumnRange so the renderer underlines them all. Cap at
227 // `MAX_COLUMNS_PER_LINE` to bound work on pathological lines
228 // where a regex matches every character (e.g. `.` or `\w`
229 // against a long minified line).
230 const MAX_COLUMNS_PER_LINE:usize = 100;
231
232 let mut Columns:Vec<ColumnRange> = Vec::new();
233
234 let mut StartByte = 0usize;
235
236 // Search within the truncated preview so columns line up with
237 // the preview text the renderer will display.
238 let SearchBytes = &LineBytes[..SafeCap];
239
240 while StartByte <= SearchBytes.len() && Columns.len() < MAX_COLUMNS_PER_LINE {
241 match self.matcher.find_at(SearchBytes, StartByte) {
242 Ok(Some(M)) => {
243 if M.start() >= SearchBytes.len() {
244 break;
245 }
246
247 Columns.push(ColumnRange { start:ByteToChar(M.start()), end:ByteToChar(M.end()) });
248
249 // `M.end() == M.start()` happens for zero-width
250 // matches (e.g. `\b`); advance by one byte to
251 // avoid an infinite loop.
252 StartByte = if M.end() == M.start() { M.end() + 1 } else { M.end() };
253 },
254
255 _ => break,
256 }
257 }
258
259 // Since this sink is per-file, we know `self.path` is correct.
260 let FileURI = url::Url::from_file_path(&self.path)
261 .map_err(|_| io::Error::new(io::ErrorKind::InvalidInput, "Could not convert path to URL"))?
262 .to_string();
263
264 let NewMatch = TextMatch { preview:Preview, line_number:LineNumber, columns:Columns };
265
266 // Mutex acquired AFTER the column-range scan so contention
267 // doesn't serialise the per-line regex work across the
268 // `WalkBuilder::build_parallel()` workers.
269 let mut ResultsGuard = self
270 .results
271 .lock()
272 .map_err(|Error| io::Error::new(io::ErrorKind::Other, Error.to_string()))?;
273
274 // Find the entry for our file, or create it if it's the first match.
275 if let Some(FileMatch) = ResultsGuard.iter_mut().find(|fm| fm.resource == FileURI) {
276 FileMatch.matches.push(NewMatch);
277 } else {
278 ResultsGuard.push(FileMatch { resource:FileURI, matches:vec![NewMatch] });
279 }
280
281 // Continue searching
282 Ok(true)
283 }
284}
285
286#[async_trait]
287impl SearchProvider for MountainEnvironment {
288 async fn TextSearch(&self, QueryValue:Value, _OptionsValue:Value) -> Result<Value, CommonError> {
289 let Query:TextSearchQuery = serde_json::from_value(QueryValue)?;
290
291 dev_log!("search", "[SearchProvider] Performing text search for: {:?}", Query);
292
293 let mut Builder = RegexMatcherBuilder::new();
294
295 Builder
296 .case_insensitive(!Query.is_case_sensitive.unwrap_or(false))
297 .word(Query.is_word_match.unwrap_or(false))
298 .multi_line(Query.is_multiline.unwrap_or(false));
299
300 // When `isRegExp` is false/missing (the default for the Search
301 // view's plain-text mode), escape the pattern so literal
302 // searches for strings containing regex metacharacters
303 // (`.`, `(`, `[`, `*`, `?`, etc.) don't crash the compiler
304 // or silently match the wrong thing.
305 let CompiledPattern = if Query.is_reg_exp.unwrap_or(false) {
306 Query.pattern.clone()
307 } else {
308 regex::escape(&Query.pattern)
309 };
310
311 let Matcher = Builder.build(&CompiledPattern).map_err(|Error| {
312 CommonError::InvalidArgument { ArgumentName:"pattern".into(), Reason:Error.to_string() }
313 })?;
314
315 let AllMatches = Arc::new(Mutex::new(Vec::<FileMatch>::new()));
316
317 let Folders = self
318 .ApplicationState
319 .Workspace
320 .WorkspaceFolders
321 .lock()
322 .map_err(Utility::ErrorMapping::MapApplicationStateLockErrorToCommonError)?
323 .clone();
324
325 if Folders.is_empty() {
326 dev_log!("search", "warn: [SearchProvider] No workspace folders to search in.");
327
328 return Ok(json!([]));
329 }
330
331 for Folder in Folders {
332 if let Ok(FolderPath) = Folder.URI.to_file_path() {
333 // Use a parallel walker for better performance.
334 let Walker = WalkBuilder::new(FolderPath).build_parallel();
335
336 // The `search_parallel` method is not available on `Searcher`. We must process
337 // entries from the walker and call `search_path` individually.
338 Walker.run(|| {
339 // `line_number(true)` is mandatory - without it,
340 // `SinkMatch::line_number()` returns None and every
341 // match lands at line 0, which the renderer treats
342 // as "no line info" and collapses into an
343 // uncategorised count-of-zero. The default
344 // `Searcher::new()` constructor disables line
345 // numbers for performance.
346 let mut Searcher = SearcherBuilder::new().line_number(true).build();
347
348 let Matcher = Matcher.clone();
349
350 let AllMatches = AllMatches.clone();
351
352 Box::new(move |EntryResult| {
353 if let Ok(Entry) = EntryResult {
354 if Entry.file_type().map_or(false, |ft| ft.is_file()) {
355 // For each file, create a new sink that knows its path.
356 let Sink = PerFileSink {
357 path:Entry.path().to_path_buf(),
358 results:AllMatches.clone(),
359 matcher:Matcher.clone(),
360 };
361
362 if let Err(Error) = Searcher.search_path(&Matcher, Entry.path(), Sink) {
363 dev_log!(
364 "search",
365 "warn: [SearchProvider] Error searching path {}: {}",
366 Entry.path().display(),
367 Error
368 );
369 }
370 }
371 }
372
373 ignore::WalkState::Continue
374 })
375 });
376 }
377 }
378
379 let FinalMatches = AllMatches
380 .lock()
381 .map_err(|Error| CommonError::StateLockPoisoned { Context:Error.to_string() })?
382 .clone();
383
384 let TotalLineMatches:usize = FinalMatches.iter().map(|F| F.matches.len()).sum();
385
386 dev_log!(
387 "search",
388 "[SearchProvider] returned {} files / {} line-matches for pattern={:?}",
389 FinalMatches.len(),
390 TotalLineMatches,
391 Query.pattern
392 );
393
394 Ok(json!(FinalMatches))
395 }
396}