Skip to content

Commit 4e0354b

Browse files
authored
Normalize the title and description we return over fetch(), fix highlighting (#2311)
* Normalize the title and description we return over fetch(), fix highlighting * Highlight synonyms * Bidirectional synonyms now also parse the source => target replacement syntax
1 parent dc6d79d commit 4e0354b

File tree

8 files changed

+949
-48
lines changed

8 files changed

+949
-48
lines changed

src/Elastic.Documentation.Configuration/Search/SearchConfiguration.cs

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,10 +25,18 @@ public required IReadOnlyDictionary<string, string[]> Synonyms
2525
var targets = new List<string[]>();
2626
foreach (var s in a)
2727
{
28-
if (s.Contains(' ') || s.Contains("=>"))
28+
if (s.Contains(' '))
2929
continue;
3030

3131
List<string> newTarget = [s];
32+
if (s.Contains("=>"))
33+
{
34+
var tokens = s.Split("=>");
35+
if (tokens.Length > 1)
36+
newTarget = [tokens[0].Trim()];
37+
else
38+
continue;
39+
}
3240
newTarget.AddRange(a.Except([s]));
3341
targets.Add(newTarget.ToArray());
3442
}

src/Elastic.Documentation.Site/Assets/web-components/SearchOrAskAi/Search/SearchResults/SearchResultsListItem.tsx

Lines changed: 5 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -143,9 +143,7 @@ export function SearchResultListItem({
143143
`}
144144
>
145145
<SanitizedHtmlContent
146-
htmlContent={
147-
result.highlightedTitle ?? result.title
148-
}
146+
htmlContent={result.title}
149147
ellipsis={false}
150148
/>
151149
</div>
@@ -164,14 +162,10 @@ export function SearchResultListItem({
164162
//width: 90%;
165163
`}
166164
>
167-
{result.highlightedBody ? (
168-
<SanitizedHtmlContent
169-
htmlContent={result.highlightedBody}
170-
ellipsis={true}
171-
/>
172-
) : (
173-
<span>{result.description}</span>
174-
)}
165+
<SanitizedHtmlContent
166+
htmlContent={result.description}
167+
ellipsis={true}
168+
/>
175169
</div>
176170
</EuiText>
177171
{result.parents.length > 0 && (

src/Elastic.Documentation.Site/Assets/web-components/SearchOrAskAi/Search/useSearchQuery.ts

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,8 +35,6 @@ const SearchResultItem = z.object({
3535
description: z.string(),
3636
score: z.number(),
3737
parents: z.array(SearchResultItemParent),
38-
highlightedTitle: z.string().nullish(),
39-
highlightedBody: z.string().nullish(),
4038
})
4139

4240
export type SearchResultItem = z.infer<typeof SearchResultItem>

src/api/Elastic.Documentation.Api.Core/Search/SearchUsecase.cs

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -82,9 +82,5 @@ public record SearchResultItem
8282
public required string Title { get; init; }
8383
public required string Description { get; init; }
8484
public required SearchResultItemParent[] Parents { get; init; }
85-
public string[]? Headings { get; init; }
8685
public float Score { get; init; }
87-
public string? HighlightedBody { get; init; }
88-
89-
public string? HighlightedTitle { get; init; }
9086
}

src/api/Elastic.Documentation.Api.Infrastructure/Adapters/Search/ElasticsearchGateway.cs

Lines changed: 26 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -276,18 +276,13 @@ public async Task<SearchResult> SearchImplementation(string query, int pageNumbe
276276
)
277277
)
278278
.Highlight(h => h
279-
.RequireFieldMatch(true)
280279
.Fields(f => f
281-
.Add(Infer.Field<DocumentationDocument>(d => d.SearchTitle.Suffix("completion")), hf => hf
280+
.Add(Infer.Field<DocumentationDocument>(d => d.Title), hf => hf
282281
.FragmentSize(150)
283282
.NumberOfFragments(3)
284283
.NoMatchSize(150)
285-
.BoundaryChars(":.!?\t\n")
286-
.BoundaryScanner(BoundaryScanner.Sentence)
287-
.BoundaryMaxScan(15)
288-
.FragmentOffset(0)
289284
.HighlightQuery(q => q.Match(m => m
290-
.Field(d => d.SearchTitle.Suffix("completion"))
285+
.Field(d => d.Title)
291286
.Query(searchQuery)
292287
.Analyzer("highlight_analyzer")
293288
))
@@ -297,15 +292,6 @@ public async Task<SearchResult> SearchImplementation(string query, int pageNumbe
297292
.FragmentSize(150)
298293
.NumberOfFragments(3)
299294
.NoMatchSize(150)
300-
.BoundaryChars(":.!?\t\n")
301-
.BoundaryScanner(BoundaryScanner.Sentence)
302-
.BoundaryMaxScan(15)
303-
.FragmentOffset(0)
304-
.HighlightQuery(q => q.Match(m => m
305-
.Field(d => d.StrippedBody)
306-
.Query(searchQuery)
307-
.Analyzer("highlight_analyzer")
308-
))
309295
.PreTags(preTag)
310296
.PostTags(postTag))
311297
)
@@ -324,7 +310,7 @@ public async Task<SearchResult> SearchImplementation(string query, int pageNumbe
324310
else
325311
_logger.LogInformation("RRF search completed for '{Query}'. Total hits: {TotalHits}", query, response.Total);
326312

327-
return ProcessSearchResponse(response);
313+
return ProcessSearchResponse(response, searchQuery, _searchConfiguration.SynonymBiDirectional);
328314
}
329315
catch (Exception ex)
330316
{
@@ -333,9 +319,13 @@ public async Task<SearchResult> SearchImplementation(string query, int pageNumbe
333319
}
334320
}
335321

336-
private static SearchResult ProcessSearchResponse(SearchResponse<DocumentationDocument> response)
322+
private static SearchResult ProcessSearchResponse(
323+
SearchResponse<DocumentationDocument> response,
324+
string searchQuery,
325+
IReadOnlyDictionary<string, string[]> synonyms)
337326
{
338327
var totalHits = (int)response.Total;
328+
var searchTokens = searchQuery.Split(' ', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries);
339329

340330
var results = response.Documents.Select((doc, index) =>
341331
{
@@ -348,36 +338,42 @@ private static SearchResult ProcessSearchResponse(SearchResponse<DocumentationDo
348338
if (highlights != null)
349339
{
350340
if (highlights.TryGetValue("stripped_body", out var bodyHighlights) && bodyHighlights.Count > 0)
351-
highlightedBody = string.Join(". ", bodyHighlights.Select(h => h.TrimEnd('.', ' ', '-')));
341+
highlightedBody = string.Join(". ", bodyHighlights.Select(h => h.Trim(['|', ' ', '.', '-'])));
352342

353-
if (highlights.TryGetValue("search_title.completion", out var titleHighlights) && titleHighlights.Count > 0)
354-
highlightedTitle = string.Join(". ", titleHighlights.Select(h => h.TrimEnd('.', ' ', '-')));
343+
if (highlights.TryGetValue("title", out var titleHighlights) && titleHighlights.Count > 0)
344+
highlightedTitle = string.Join(". ", titleHighlights.Select(h => h.Trim(['|', ' ', '.', '-'])));
355345
}
356346

347+
var title = (highlightedTitle ?? doc.Title).HighlightTokens(searchTokens, synonyms);
348+
var description = (!string.IsNullOrWhiteSpace(highlightedBody) ? highlightedBody : doc.Description ?? string.Empty)
349+
.Replace("\r\n", " ")
350+
.Replace("\n", " ")
351+
.Replace("\r", " ")
352+
.Trim(['|', ' '])
353+
.HighlightTokens(searchTokens, synonyms);
354+
357355
return new SearchResultItem
358356
{
359357
Url = doc.Url,
360-
Title = doc.Title,
358+
Title = title,
361359
Type = doc.Type,
362-
Description = doc.Description ?? string.Empty,
363-
Headings = doc.Headings,
360+
Description = description,
364361
Parents = doc.Parents.Select(parent => new SearchResultItemParent
365362
{
366363
Title = parent.Title,
367364
Url = parent.Url
368365
}).ToArray(),
369-
Score = (float)(hit?.Score ?? 0.0),
370-
HighlightedTitle = highlightedTitle,
371-
HighlightedBody = highlightedBody
366+
Score = (float)(hit?.Score ?? 0.0)
372367
};
373368
}).ToList();
374369

375370
// Extract aggregations
376371
var aggregations = new Dictionary<string, long>();
377-
if (response.Aggregations?.TryGetValue("type", out var typeAgg) == true && typeAgg is StringTermsAggregate stringTermsAgg)
372+
var terms = response.Aggregations?.GetStringTerms("type");
373+
if (terms is not null)
378374
{
379-
foreach (var bucket in stringTermsAgg.Buckets)
380-
aggregations[bucket.Key.ToString()!] = bucket.DocCount;
375+
foreach (var bucket in terms.Buckets)
376+
aggregations[bucket.Key.ToString()] = bucket.DocCount;
381377
}
382378

383379
return new SearchResult
Lines changed: 206 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,206 @@
1+
// Licensed to Elasticsearch B.V under one or more agreements.
2+
// Elasticsearch B.V licenses this file to you under the Apache 2.0 License.
3+
// See the LICENSE file in the project root for more information
4+
5+
using System.Text;
6+
7+
namespace Elastic.Documentation.Api.Infrastructure.Adapters.Search;
8+
9+
public static class StringHighlightExtensions
{
    private const string MarkOpen = "<mark>";
    private const string MarkClose = "</mark>";

    /// <summary>
    /// Highlights search tokens in text by wrapping them with &lt;mark&gt; tags.
    /// Matching is case-insensitive (ordinal) and the original casing of the text is preserved.
    /// Occurrences that overlap the literal &lt;mark&gt; / &lt;/mark&gt; tag syntax, or that already sit
    /// inside the content of an existing mark element, are left untouched; every other occurrence
    /// is wrapped, even when the same token is already highlighted elsewhere in the text.
    /// </summary>
    /// <param name="text">The text to highlight tokens in</param>
    /// <param name="tokens">The search tokens to highlight; null or empty entries are skipped</param>
    /// <param name="synonyms">
    /// Optional dictionary of synonyms to also highlight. Values may be plain synonyms or
    /// hard replacements in the form "source => target".
    /// </param>
    /// <returns>
    /// Text with highlighted tokens; <paramref name="text"/> itself when it is null/empty
    /// or when no tokens were supplied.
    /// </returns>
    public static string HighlightTokens(
        this string text,
        ReadOnlySpan<string> tokens,
        IReadOnlyDictionary<string, string[]>? synonyms = null)
    {
        if (tokens.Length == 0 || string.IsNullOrEmpty(text))
            return text;

        var result = text;

        foreach (var token in tokens)
        {
            if (string.IsNullOrEmpty(token))
                continue;

            // Highlight the token itself
            result = HighlightSingleToken(result, token);

            if (synonyms is null)
                continue;

            // Highlight synonyms registered directly under this token
            if (synonyms.TryGetValue(token, out var tokenSynonyms))
            {
                foreach (var synonym in tokenSynonyms)
                {
                    var synonymToHighlight = ExtractSynonymTarget(synonym);
                    if (!string.IsNullOrEmpty(synonymToHighlight))
                        result = HighlightSingleToken(result, synonymToHighlight);
                }
            }

            // Also check for hard replacements where this token is the source, regardless of
            // which key they are stored under.
            // Format: "source => target" means when searching for "source", also highlight "target"
            foreach (var kvp in synonyms)
            {
                foreach (var synonym in kvp.Value)
                {
                    if (string.IsNullOrEmpty(synonym) || !synonym.Contains("=>", StringComparison.Ordinal))
                        continue;

                    var (source, target) = ParseHardReplacement(synonym);
                    if (!string.IsNullOrEmpty(source) &&
                        !string.IsNullOrEmpty(target) &&
                        source.Equals(token, StringComparison.OrdinalIgnoreCase))
                    {
                        result = HighlightSingleToken(result, target);
                    }
                }
            }
        }

        return result;
    }

    /// <summary>
    /// Extracts the target from a synonym entry, handling hard replacement format.
    /// For "source => target" returns "target", otherwise returns the original synonym.
    /// Returns null for a null or empty entry.
    /// </summary>
    private static string? ExtractSynonymTarget(string? synonym)
    {
        if (string.IsNullOrEmpty(synonym))
            return null;

        if (!synonym.Contains("=>", StringComparison.Ordinal))
            return synonym;

        var (_, target) = ParseHardReplacement(synonym);
        return target;
    }

    /// <summary>
    /// Parses a hard replacement synonym format: "source => target".
    /// Splits on the first "=>" and trims both halves; returns (null, null) when there is no arrow.
    /// </summary>
    private static (string? Source, string? Target) ParseHardReplacement(string synonym)
    {
        var arrowIndex = synonym.IndexOf("=>", StringComparison.Ordinal);
        if (arrowIndex < 0)
            return (null, null);

        var source = synonym[..arrowIndex].Trim();
        var target = synonym[(arrowIndex + 2)..].Trim();

        return (source, target);
    }

    /// <summary>
    /// Wraps every unprotected, case-insensitive occurrence of <paramref name="token"/> in
    /// <paramref name="text"/> with mark tags. Occurrences overlapping tag syntax or located
    /// inside an existing mark element are copied through unchanged.
    /// </summary>
    private static string HighlightSingleToken(string text, string token)
    {
        // An empty token matches at index 0 forever and would never advance the scan position.
        if (string.IsNullOrEmpty(text) || string.IsNullOrEmpty(token))
            return text;

        var textSpan = text.AsSpan();
        var tokenSpan = token.AsSpan();

        // Fast path: nothing to wrap, so hand back the same instance without allocating a builder.
        if (textSpan.IndexOf(tokenSpan, StringComparison.OrdinalIgnoreCase) < 0)
            return text;

        // NOTE: there is deliberately no "already highlighted somewhere" short-circuit here.
        // The per-match checks below already prevent double wrapping, and bailing out early
        // left other, un-highlighted occurrences of the same token without a mark.
        var sb = new StringBuilder(text.Length + (2 * (MarkOpen.Length + MarkClose.Length))); // Room for a couple of mark tags
        var pos = 0;

        while (pos < textSpan.Length)
        {
            var remaining = textSpan[pos..];
            var matchIndex = remaining.IndexOf(tokenSpan, StringComparison.OrdinalIgnoreCase);

            if (matchIndex < 0)
            {
                // No more matches, append rest and exit
                _ = sb.Append(remaining);
                break;
            }

            var absoluteIndex = pos + matchIndex;
            var matchEnd = matchIndex + tokenSpan.Length;

            if (IsInsideMarkTagSyntax(textSpan, absoluteIndex, tokenSpan.Length) || IsInsideMarkTagContent(textSpan, absoluteIndex))
            {
                // Append up to and including this match without highlighting
                _ = sb.Append(remaining[..matchEnd]);
            }
            else
            {
                // Append text before match, then highlighted token (preserving original case)
                _ = sb.Append(remaining[..matchIndex])
                    .Append(MarkOpen)
                    .Append(remaining[matchIndex..matchEnd])
                    .Append(MarkClose);
            }

            pos = absoluteIndex + tokenSpan.Length;
        }

        return sb.ToString();
    }

    /// <summary>
    /// Returns true when the match at <paramref name="position"/> overlaps the literal
    /// &lt;mark&gt; or &lt;/mark&gt; tag text. Only these two tags are protected, not arbitrary HTML.
    /// </summary>
    private static bool IsInsideMarkTagSyntax(ReadOnlySpan<char> text, int position, int tokenLength)
    {
        var matchEnd = position + tokenLength;

        return OverlapsTag(text, position, matchEnd, MarkOpen)
            || OverlapsTag(text, position, matchEnd, MarkClose);
    }

    /// <summary>
    /// Checks whether the range [position, matchEnd) overlaps an occurrence of <paramref name="tag"/>.
    /// </summary>
    private static bool OverlapsTag(ReadOnlySpan<char> text, int position, int matchEnd, string tag)
    {
        // A tag can only overlap the match if it starts at most (tag.Length - 1) chars before it,
        // so the window begins there; the first tag found in the window is the earliest candidate
        // and therefore decides the outcome.
        var searchStart = Math.Max(0, position - (tag.Length - 1));
        var searchEnd = Math.Min(text.Length, matchEnd + tag.Length);
        var searchRegion = text[searchStart..searchEnd];

        var tagIdx = searchRegion.IndexOf(tag.AsSpan(), StringComparison.OrdinalIgnoreCase);
        if (tagIdx < 0)
            return false;

        var absoluteTagStart = searchStart + tagIdx;
        var absoluteTagEnd = absoluteTagStart + tag.Length;

        return position < absoluteTagEnd && matchEnd > absoluteTagStart;
    }

    /// <summary>
    /// Returns true when <paramref name="position"/> lies after an opening mark tag that has not
    /// been closed yet, i.e. inside the content of an existing mark element.
    /// </summary>
    private static bool IsInsideMarkTagContent(ReadOnlySpan<char> text, int position)
    {
        // Look backwards from position to find the last <mark> or </mark>
        var beforePosition = text[..position];

        var lastOpen = beforePosition.LastIndexOf(MarkOpen.AsSpan(), StringComparison.OrdinalIgnoreCase);
        var lastClose = beforePosition.LastIndexOf(MarkClose.AsSpan(), StringComparison.OrdinalIgnoreCase);

        // If we found an opening tag after the last closing tag, we're inside a mark's content
        return lastOpen > lastClose;
    }
}

0 commit comments

Comments
 (0)