Skip to content
This repository was archived by the owner on Mar 5, 2022. It is now read-only.

Commit 47d2d84

Browse files
EvanDotPro and zmwangx
authored and committed
Show matched keywords as bold in result abstracts
The JSON schema has also been extended to include a `matches` array for each result. Each entry in the `matches` array is an object with `offset` and `phrase` keys. Closes #283. Signed-off-by: Zhiming Wang <[email protected]>
1 parent c66f129 commit 47d2d84

File tree

1 file changed

+23
-5
lines changed

1 file changed

+23
-5
lines changed

googler

Lines changed: 23 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -2121,7 +2121,12 @@ class GoogleParser(object):
21212121
if mime:
21222122
title = mime.text + ' ' + title
21232123
url = self.unwrap_link(a.attr('href'))
2124-
abstract = div_g.select('.st').text.replace('\n', '')
2124+
matched_keywords = []
2125+
abstract = ''
2126+
for childnode in div_g.select('.st').children:
2127+
if childnode.tag == 'b' and childnode.text != '...':
2128+
matched_keywords.append({'phrase': childnode.text, 'offset': len(abstract)})
2129+
abstract = abstract + childnode.text.replace('\n', '')
21252130
try:
21262131
metadata = div_g.select('.slp').text
21272132
metadata = metadata.replace('\u200e', '').replace(' - ', ', ').strip()
@@ -2141,7 +2146,7 @@ class GoogleParser(object):
21412146
continue
21422147
index += 1
21432148
self.results.append(Result(index, title, url, abstract,
2144-
metadata=metadata, sitelinks=sitelinks))
2149+
metadata=metadata, sitelinks=sitelinks, matches=matched_keywords))
21452150

21462151
# Showing results for ...
21472152
# Search instead for ...
@@ -2221,6 +2226,7 @@ class Result(object):
22212226
abstract : str
22222227
metadata : str or None
22232228
sitelinks : list
2229+
matches : list
22242230
22252231
Class Variables
22262232
---------------
@@ -2238,14 +2244,15 @@ class Result(object):
22382244
colors = None
22392245
urlexpand = True
22402246

2241-
def __init__(self, index, title, url, abstract, metadata=None, sitelinks=None):
2247+
def __init__(self, index, title, url, abstract, metadata=None, sitelinks=None, matches=None):
22422248
index = str(index)
22432249
self.index = index
22442250
self.title = title
22452251
self.url = url
22462252
self.abstract = abstract
22472253
self.metadata = metadata
22482254
self.sitelinks = [] if sitelinks is None else sitelinks
2255+
self.matches = [] if matches is None else matches
22492256

22502257
self._urltable = {index: url}
22512258
subindex = 'a'
@@ -2276,7 +2283,7 @@ class Result(object):
22762283
else:
22772284
print(' %s%-*s %s %s' % (' ' * pre, indent, index + '.', title, url))
22782285

2279-
def _print_metadata_and_abstract(self, abstract, metadata=None, indent=5, pre=0):
2286+
def _print_metadata_and_abstract(self, abstract, metadata=None, matches=None, indent=5, pre=0):
22802287
colors = self.colors
22812288
try:
22822289
columns, _ = os.get_terminal_size()
@@ -2290,6 +2297,15 @@ class Result(object):
22902297
print(' ' * (indent + pre) + metadata)
22912298

22922299
if colors:
2300+
# Start from the last match, as inserting the bold characters changes the offsets.
2301+
for match in reversed(matches or []):
2302+
abstract = (
2303+
abstract[: match['offset']]
2304+
+ '\033[1m'
2305+
+ match['phrase']
2306+
+ '\033[0m'
2307+
+ abstract[match['offset'] + len(match['phrase']) :]
2308+
)
22932309
print(colors.abstract, end='')
22942310
if columns > indent + 1 + pre:
22952311
# Try to fill to columns
@@ -2305,7 +2321,7 @@ class Result(object):
23052321
def print(self):
23062322
"""Print the result entry."""
23072323
self._print_title_and_url(self.index, self.title, self.url)
2308-
self._print_metadata_and_abstract(self.abstract, metadata=self.metadata)
2324+
self._print_metadata_and_abstract(self.abstract, metadata=self.metadata, matches=self.matches)
23092325

23102326
for sitelink in self.sitelinks:
23112327
self._print_title_and_url(sitelink.index, sitelink.title, sitelink.url, pre=4)
@@ -2322,6 +2338,8 @@ class Result(object):
23222338
obj['metadata'] = self.metadata
23232339
if self.sitelinks:
23242340
obj['sitelinks'] = [sitelink.__dict__ for sitelink in self.sitelinks]
2341+
if self.matches:
2342+
obj['matches'] = self.matches
23252343
return obj
23262344

23272345
def urltable(self):

0 commit comments

Comments
 (0)