|
| 1 | +// Licensed to Elasticsearch B.V under one or more agreements. |
| 2 | +// Elasticsearch B.V licenses this file to you under the Apache 2.0 License. |
| 3 | +// See the LICENSE file in the project root for more information |
| 4 | + |
| 5 | +using System.Text; |
| 6 | + |
| 7 | +namespace Elastic.Documentation.Api.Infrastructure.Adapters.Search; |
| 8 | + |
/// <summary>
/// String extensions that wrap search-term matches in &lt;mark&gt; tags.
/// </summary>
public static class StringHighlightExtensions
{
    private const string MarkOpen = "<mark>";
    private const string MarkClose = "</mark>";
    private const string HardReplacementArrow = "=>";

    /// <summary>
    /// Highlights search tokens in text by wrapping them with &lt;mark&gt; tags.
    /// Matching is a case-insensitive (ordinal) substring search; the casing of the
    /// matched text is preserved. Matches that overlap a literal &lt;mark&gt; / &lt;/mark&gt;
    /// tag, or that sit inside the content of an existing mark element, are left untouched.
    /// </summary>
    /// <param name="text">The text to highlight tokens in</param>
    /// <param name="tokens">The search tokens to highlight; null or empty entries are skipped</param>
    /// <param name="synonyms">
    /// Optional dictionary of synonyms to also highlight. Entries may be plain terms or
    /// hard replacements in the form "source => target".
    /// </param>
    /// <returns>
    /// Text with highlighted tokens. <paramref name="text"/> is returned as-is when it is
    /// null/empty or when <paramref name="tokens"/> is empty.
    /// </returns>
    public static string HighlightTokens(
        this string text,
        ReadOnlySpan<string> tokens,
        IReadOnlyDictionary<string, string[]>? synonyms = null)
    {
        if (tokens.Length == 0 || string.IsNullOrEmpty(text))
            return text;

        var result = text;

        foreach (var token in tokens)
        {
            if (string.IsNullOrEmpty(token))
                continue;

            // Highlight the token itself
            result = HighlightSingleToken(result, token);

            if (synonyms == null)
                continue;

            // Highlight synonyms registered directly under this token
            if (synonyms.TryGetValue(token, out var tokenSynonyms))
            {
                foreach (var synonym in tokenSynonyms)
                {
                    var synonymToHighlight = ExtractSynonymTarget(synonym);
                    if (!string.IsNullOrEmpty(synonymToHighlight))
                        result = HighlightSingleToken(result, synonymToHighlight);
                }
            }

            // Also honour hard replacements anywhere in the dictionary whose source is this token
            result = HighlightHardReplacementTargets(result, token, synonyms);
        }

        return result;
    }

    /// <summary>
    /// Scans every synonym entry for hard replacements ("source => target") whose source
    /// equals <paramref name="token"/> (ordinal, ignoring case) and highlights each target.
    /// </summary>
    private static string HighlightHardReplacementTargets(
        string text,
        string token,
        IReadOnlyDictionary<string, string[]> synonyms)
    {
        var result = text;

        foreach (var kvp in synonyms)
        {
            foreach (var synonym in kvp.Value)
            {
                if (string.IsNullOrEmpty(synonym) || !synonym.Contains(HardReplacementArrow))
                    continue;

                var (source, target) = ParseHardReplacement(synonym);
                if (!string.IsNullOrEmpty(source) &&
                    !string.IsNullOrEmpty(target) &&
                    source.Equals(token, StringComparison.OrdinalIgnoreCase))
                {
                    result = HighlightSingleToken(result, target);
                }
            }
        }

        return result;
    }

    /// <summary>
    /// Extracts the target from a synonym entry, handling hard replacement format.
    /// For "source => target" returns "target", otherwise returns the original synonym.
    /// Returns null for a null or empty entry.
    /// </summary>
    private static string? ExtractSynonymTarget(string? synonym)
    {
        if (string.IsNullOrEmpty(synonym))
            return null;

        if (!synonym.Contains(HardReplacementArrow))
            return synonym;

        var (_, target) = ParseHardReplacement(synonym);
        return target;
    }

    /// <summary>
    /// Parses a hard replacement synonym format: "source => target".
    /// Splits on the first "=>" and trims both halves; returns (null, null) when no arrow is present.
    /// </summary>
    private static (string? Source, string? Target) ParseHardReplacement(string synonym)
    {
        var arrowIndex = synonym.IndexOf(HardReplacementArrow, StringComparison.Ordinal);
        if (arrowIndex < 0)
            return (null, null);

        var source = synonym[..arrowIndex].Trim();
        var target = synonym[(arrowIndex + HardReplacementArrow.Length)..].Trim();

        return (source, target);
    }

    /// <summary>
    /// Wraps every not-yet-highlighted occurrence of <paramref name="token"/> in
    /// <paramref name="text"/> with mark tags, scanning left to right over non-overlapping matches.
    /// </summary>
    private static string HighlightSingleToken(string text, string token)
    {
        // An empty token matches at every position without advancing the scan; bail out
        // instead of looping forever.
        if (token.Length == 0)
            return text;

        // Nothing to wrap: return the original instance without allocating a builder.
        if (!text.Contains(token, StringComparison.OrdinalIgnoreCase))
            return text;

        // Note: there is deliberately no "already highlighted somewhere" shortcut here.
        // Occurrences inside existing marks are skipped one by one below, so the remaining
        // plain occurrences still get highlighted and nothing is wrapped twice.
        var sb = new StringBuilder(text.Length + 26); // Room for a couple of mark tags
        var textSpan = text.AsSpan();
        var tokenSpan = token.AsSpan();
        var pos = 0;

        while (pos < textSpan.Length)
        {
            var remaining = textSpan[pos..];
            var matchIndex = remaining.IndexOf(tokenSpan, StringComparison.OrdinalIgnoreCase);

            if (matchIndex < 0)
            {
                // No more matches, append rest and exit
                _ = sb.Append(remaining);
                break;
            }

            var absoluteIndex = pos + matchIndex;
            var matchEndInRemaining = matchIndex + tokenSpan.Length;

            if (IsInsideMarkTagSyntax(textSpan, absoluteIndex, tokenSpan.Length) ||
                IsInsideMarkTagContent(textSpan, absoluteIndex))
            {
                // Copy up to and including this match without highlighting it
                _ = sb.Append(remaining[..matchEndInRemaining]);
            }
            else
            {
                // Copy text before the match, then the match itself (original casing) inside mark tags
                _ = sb.Append(remaining[..matchIndex])
                    .Append(MarkOpen)
                    .Append(remaining.Slice(matchIndex, tokenSpan.Length))
                    .Append(MarkClose);
            }

            pos = absoluteIndex + tokenSpan.Length;
        }

        return sb.ToString();
    }

    /// <summary>
    /// Returns true when the match at [position, position + tokenLength) overlaps the literal
    /// characters of a &lt;mark&gt; or &lt;/mark&gt; tag. Only these two tag strings are protected,
    /// not arbitrary HTML.
    /// </summary>
    private static bool IsInsideMarkTagSyntax(ReadOnlySpan<char> text, int position, int tokenLength)
    {
        var matchEnd = position + tokenLength;

        return OverlapsTag(text, position, matchEnd, MarkOpen)
            || OverlapsTag(text, position, matchEnd, MarkClose);
    }

    /// <summary>
    /// Checks whether an occurrence of <paramref name="tag"/> (ordinal, ignoring case) overlaps
    /// the match range [position, matchEnd).
    /// </summary>
    private static bool OverlapsTag(ReadOnlySpan<char> text, int position, int matchEnd, string tag)
    {
        // A tag can only overlap the match if it starts at most (tag.Length - 1) characters
        // before it, so the search window is limited to that neighbourhood.
        var searchStart = Math.Max(0, position - (tag.Length - 1));
        var searchEnd = Math.Min(text.Length, matchEnd + tag.Length);

        var tagIndex = text[searchStart..searchEnd].IndexOf(tag.AsSpan(), StringComparison.OrdinalIgnoreCase);
        if (tagIndex < 0)
            return false;

        var tagStart = searchStart + tagIndex;
        var tagEnd = tagStart + tag.Length;

        // Half-open interval overlap test
        return position < tagEnd && matchEnd > tagStart;
    }

    /// <summary>
    /// Returns true when the nearest mark tag before <paramref name="position"/> is an opening
    /// tag, i.e. the position lies in the content of a mark element that has not been closed yet.
    /// </summary>
    private static bool IsInsideMarkTagContent(ReadOnlySpan<char> text, int position)
    {
        var beforePosition = text[..position];

        var lastOpen = beforePosition.LastIndexOf(MarkOpen.AsSpan(), StringComparison.OrdinalIgnoreCase);
        var lastClose = beforePosition.LastIndexOf(MarkClose.AsSpan(), StringComparison.OrdinalIgnoreCase);

        // Both are -1 when no tag precedes the position, which correctly yields false
        return lastOpen > lastClose;
    }
}
0 commit comments