Skip to content

Commit 5642ec2

Browse files
committed
Improve the tokenizer and stop passing strings by value
1 parent 4e76f88 commit 5642ec2

File tree

3 files changed

+40
-44
lines changed

3 files changed

+40
-44
lines changed

benches/benchmarks.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,28 +9,28 @@ const QUERY_100_CHAR: &str = "SELECT name, COUNT(name) FROM commits GROUP BY nam
99

1010
fn tokenizer_100_char_benchmark(c: &mut Criterion) {
1111
c.bench_function("Tokenizer 100 Char", |b| {
12-
b.iter(|| Tokenizer::tokenize(black_box(QUERY_100_CHAR.to_owned())))
12+
b.iter(|| Tokenizer::tokenize(black_box(QUERY_100_CHAR)))
1313
});
1414
}
1515

1616
fn tokenizer_100k_char_benchmark(c: &mut Criterion) {
1717
let query_100k_char = QUERY_100_CHAR.repeat(100_000 / 100);
1818
c.bench_function("Tokenizer 100K Char", |b| {
19-
b.iter(|| Tokenizer::tokenize(black_box(query_100k_char.to_owned())))
19+
b.iter(|| Tokenizer::tokenize(black_box(&query_100k_char)))
2020
});
2121
}
2222

2323
fn tokenizer_1m_char_benchmark(c: &mut Criterion) {
2424
let query_100k_char = QUERY_100_CHAR.repeat(1_000_000 / 100);
2525
c.bench_function("Tokenizer 1M Char", |b| {
26-
b.iter(|| Tokenizer::tokenize(black_box(query_100k_char.to_owned())))
26+
b.iter(|| Tokenizer::tokenize(black_box(&query_100k_char)))
2727
});
2828
}
2929

3030
fn tokenizer_10m_char_benchmark(c: &mut Criterion) {
3131
let query_100k_char = QUERY_100_CHAR.repeat(10_000_000 / 100);
3232
c.bench_function("Tokenizer 10M Char", |b| {
33-
b.iter(|| Tokenizer::tokenize(black_box(query_100k_char.to_owned())))
33+
b.iter(|| Tokenizer::tokenize(black_box(&query_100k_char)))
3434
});
3535
}
3636

crates/gitql-parser/src/tokenizer.rs

Lines changed: 28 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -4,23 +4,22 @@ use crate::token::Token;
44
use crate::token::TokenKind;
55
use crate::token::GITQL_RESERVED_KEYWORDS;
66

7-
pub struct Tokenizer {
8-
pub(crate) content: Vec<char>,
9-
pub(crate) content_len: usize,
10-
pub(crate) index: usize,
11-
12-
pub(crate) line_start: u32,
13-
pub(crate) line_end: u32,
14-
pub(crate) column_start: u32,
15-
pub(crate) column_end: u32,
7+
pub struct Tokenizer<'a> {
8+
content: &'a [char],
9+
content_len: usize,
10+
index: usize,
11+
12+
line_start: u32,
13+
line_end: u32,
14+
column_start: u32,
15+
column_end: u32,
1616
}
1717

18-
impl Tokenizer {
19-
pub(crate) fn new(chars: Vec<char>) -> Tokenizer {
20-
let content_len = chars.len();
18+
impl<'a> Tokenizer<'a> {
19+
pub(crate) fn new(chars: &'a [char]) -> Tokenizer<'a> {
2120
Tokenizer {
2221
content: chars,
23-
content_len,
22+
content_len: chars.len(),
2423
index: 0,
2524

2625
line_start: 1,
@@ -30,9 +29,9 @@ impl Tokenizer {
3029
}
3130
}
3231

33-
pub fn tokenize(content: String) -> Result<Vec<Token>, Box<Diagnostic>> {
34-
let mut tokenizer = Tokenizer::new(content.chars().collect());
35-
tokenizer.tokenize_characters()
32+
pub fn tokenize(chars: &'a str) -> Result<Vec<Token>, Box<Diagnostic>> {
33+
let chars: Vec<char> = chars.chars().collect();
34+
Tokenizer::new(&chars).tokenize_characters()
3635
}
3736

3837
fn current_source_location(&self) -> SourceLocation {
@@ -340,7 +339,7 @@ impl Tokenizer {
340339

341340
// Consume `!`
342341
self.advance();
343-
let kind = if self.index < len && self.content[self.index] == '=' {
342+
let kind = if self.is_current_char('=') {
344343
// Consume `=`
345344
self.advance();
346345
TokenKind::BangEqual
@@ -420,7 +419,7 @@ impl Tokenizer {
420419
self.advance();
421420

422421
// Make sure first character is alphabetic
423-
if self.has_next() && !self.content[self.index].is_alphabetic() {
422+
if !self.is_current_char_func(|c| c.is_alphanumeric()) {
424423
return Err(Diagnostic::error(
425424
"Global variable name must start with alphabetic character",
426425
)
@@ -429,7 +428,7 @@ impl Tokenizer {
429428
.as_boxed());
430429
}
431430

432-
while self.has_next() && self.is_current_char_func(|c| c == '_' || c.is_alphanumeric()) {
431+
while self.is_current_char_func(|c| c == '_' || c.is_alphanumeric()) {
433432
self.advance();
434433
}
435434

@@ -444,8 +443,7 @@ impl Tokenizer {
444443

445444
fn consume_identifier(&mut self) -> Token {
446445
let start_index = self.index;
447-
448-
while self.has_next() && self.is_current_char_func(|c| c == '_' || c.is_alphanumeric()) {
446+
while self.is_current_char_func(|c| c == '_' || c.is_alphanumeric()) {
449447
self.advance();
450448
}
451449

@@ -467,7 +465,7 @@ impl Tokenizer {
467465
// Advance '`'
468466
self.advance();
469467

470-
while self.has_next() && !self.is_current_char('`') {
468+
while !self.is_current_char('`') {
471469
self.advance();
472470
}
473471

@@ -490,16 +488,16 @@ impl Tokenizer {
490488
fn consume_number(&mut self) -> Result<Token, Box<Diagnostic>> {
491489
let start_index = self.index;
492490

493-
while self.has_next() && self.is_current_char_func(|c| c == '_' || c.is_numeric()) {
491+
while self.is_current_char_func(|c| c == '_' || c.is_numeric()) {
494492
self.advance();
495493
}
496494

497495
let mut is_float_value = false;
498-
if self.has_next() && self.is_current_char('.') {
496+
if self.is_current_char('.') {
499497
self.advance();
500498

501499
is_float_value = true;
502-
while self.has_next() && self.is_current_char_func(|c| c == '_' || c.is_numeric()) {
500+
while self.is_current_char_func(|c| c == '_' || c.is_numeric()) {
503501
self.advance();
504502
}
505503
}
@@ -538,7 +536,7 @@ impl Tokenizer {
538536

539537
fn consume_binary_number(&mut self) -> Result<Token, Box<Diagnostic>> {
540538
let start_index = self.index;
541-
while self.has_next() && self.is_current_char_func(|c| c == '_' || c == '0' || c >= '1') {
539+
while self.is_current_char_func(|c| c == '_' || c == '0' || c >= '1') {
542540
self.advance();
543541
}
544542

@@ -575,8 +573,7 @@ impl Tokenizer {
575573

576574
fn consume_octal_number(&mut self) -> Result<Token, Box<Diagnostic>> {
577575
let start_index = self.index;
578-
while self.has_next() && self.is_current_char_func(|c| c == '_' || ('0'..='8').contains(&c))
579-
{
576+
while self.is_current_char_func(|c| c == '_' || ('0'..='8').contains(&c)) {
580577
self.advance();
581578
}
582579

@@ -613,7 +610,7 @@ impl Tokenizer {
613610

614611
fn consume_hex_number(&mut self) -> Result<Token, Box<Diagnostic>> {
615612
let start_index = self.index;
616-
while self.has_next() && self.is_current_char_func(|c| c == '_' || c.is_ascii_hexdigit()) {
613+
while self.is_current_char_func(|c| c == '_' || c.is_ascii_hexdigit()) {
617614
self.advance();
618615
}
619616

@@ -685,7 +682,7 @@ impl Tokenizer {
685682
self.advance();
686683

687684
let mut buffer = String::new();
688-
while self.has_next() && !self.is_current_char(around) {
685+
while !self.is_current_char(around) {
689686
if !self.is_current_char('\\') {
690687
buffer.push(self.content[self.index]);
691688
self.advance();
@@ -748,7 +745,7 @@ impl Tokenizer {
748745
// Advance `--`
749746
self.advance_n(2);
750747

751-
while self.has_next() && !self.is_current_char('\n') {
748+
while !self.is_current_char('\n') {
752749
self.advance();
753750
}
754751

src/main.rs

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ fn main() {
5555
let mut env = create_gitql_environment();
5656
let query =
5757
fs::read_to_string(script_file).expect("Should have been able to read the file");
58-
execute_gitql_query(query, &arguments, &repos, &mut env, &mut reporter);
58+
execute_gitql_query(&query, &arguments, &repos, &mut env, &mut reporter);
5959
}
6060
Command::QueryMode(query, arguments) => {
6161
let mut reporter = diagnostic_reporter::DiagnosticReporter::default();
@@ -70,8 +70,7 @@ fn main() {
7070

7171
let repos = git_repos_result.ok().unwrap();
7272
let mut env = create_gitql_environment();
73-
74-
execute_gitql_query(query, &arguments, &repos, &mut env, &mut reporter);
73+
execute_gitql_query(&query, &arguments, &repos, &mut env, &mut reporter);
7574
}
7675
Command::Help => {
7776
arguments::print_help_list();
@@ -117,7 +116,7 @@ fn launch_gitql_repl(arguments: &Arguments) {
117116
}
118117

119118
execute_gitql_query(
120-
input.to_owned(),
119+
&input,
121120
arguments,
122121
&git_repositories,
123122
&mut global_env,
@@ -162,7 +161,7 @@ fn launch_gitql_repl(arguments: &Arguments) {
162161
}
163162

164163
execute_gitql_query(
165-
stdin_input.to_owned(),
164+
stdin_input,
166165
arguments,
167166
&git_repositories,
168167
&mut global_env,
@@ -175,17 +174,17 @@ fn launch_gitql_repl(arguments: &Arguments) {
175174
}
176175

177176
fn execute_gitql_query(
178-
query: String,
177+
query: &str,
179178
arguments: &Arguments,
180179
repos: &[gix::Repository],
181180
env: &mut Environment,
182181
reporter: &mut DiagnosticReporter,
183182
) {
184183
let front_start = std::time::Instant::now();
185-
let tokenizer_result = Tokenizer::tokenize(query.clone());
184+
let tokenizer_result = Tokenizer::tokenize(query);
186185
if tokenizer_result.is_err() {
187186
let diagnostic = tokenizer_result.err().unwrap();
188-
reporter.report_diagnostic(&query, *diagnostic);
187+
reporter.report_diagnostic(query, *diagnostic);
189188
std::process::exit(1);
190189
}
191190

@@ -197,7 +196,7 @@ fn execute_gitql_query(
197196
let parser_result = parser::parse_gql(tokens, env);
198197
if parser_result.is_err() {
199198
let diagnostic = parser_result.err().unwrap();
200-
reporter.report_diagnostic(&query, *diagnostic);
199+
reporter.report_diagnostic(query, *diagnostic);
201200
std::process::exit(1);
202201
}
203202

0 commit comments

Comments
 (0)