From 2ece893bde5411a35308e904e46c669e5bfc1adc Mon Sep 17 00:00:00 2001 From: Fabien Potencier Date: Fri, 6 Feb 2026 11:35:17 +0100 Subject: [PATCH 1/2] =?UTF-8?q?Replace=20O(n=C2=B2)=20token=20deduplicatio?= =?UTF-8?q?n=20with=20O(1)=20hash-based=20lookup=20in=20ParseTokens?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/Tokens/ParseTokens.php | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/src/Tokens/ParseTokens.php b/src/Tokens/ParseTokens.php index d83076d..040852f 100644 --- a/src/Tokens/ParseTokens.php +++ b/src/Tokens/ParseTokens.php @@ -15,6 +15,7 @@ public function __invoke(string $content, Language $language): array { $tokens = []; + $seen = []; // Match tokens from patterns foreach ($language->getPatterns() as $key => $pattern) { @@ -33,34 +34,30 @@ public function __invoke(string $content, Language $language): array continue; } + $tokenType = $pattern->getTokenType(); + $tokenTypeValue = $tokenType->getValue(); + foreach ($match as $item) { $offset = $item[1]; $value = $item[0]; - $token = new Token( + $hashKey = $offset . ':' . $tokenTypeValue . ':' . $value; + + if (isset($seen[$hashKey])) { + continue; + } + + $seen[$hashKey] = true; + + $tokens[] = new Token( offset: $offset, value: $value, - type: $pattern->getTokenType(), + type: $tokenType, pattern: $pattern, ); - - if (! $this->tokenAlreadyPresent($tokens, $token)) { - $tokens[] = $token; - } } } return $tokens; } - - private function tokenAlreadyPresent(array $tokens, Token $token): bool - { - foreach ($tokens as $tokenToCompare) { - if ($tokenToCompare->equals($token)) { - return true; - } - } - - return false; - } } From aea918ca612c9f62be31f820ccdf85a8b3a5a05d Mon Sep 17 00:00:00 2001 From: Fabien Potencier Date: Fri, 6 Feb 2026 11:36:43 +0100 Subject: [PATCH 2/2] =?UTF-8?q?Replace=20O(n=C2=B2)=20token=20grouping=20w?= =?UTF-8?q?ith=20O(n)=20sorted=20scan=20in=20GroupTokens?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/Tokens/GroupTokens.php | 37 ++++++++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/src/Tokens/GroupTokens.php b/src/Tokens/GroupTokens.php index ba15301..7171cab 100644 --- a/src/Tokens/GroupTokens.php +++ b/src/Tokens/GroupTokens.php @@ -25,28 +25,43 @@ public function __invoke(array $tokens): array /** @var Token[] $groupedTokens */ $groupedTokens = []; - while ($token = current($tokens)) { - $token = $token->cloneWithoutParent(); + $count = count($tokens); + $removed = []; + for ($i = 0; $i < $count; $i++) { + if (isset($removed[$i])) { + continue; + } + + $token = $tokens[$i]->cloneWithoutParent(); - foreach ($tokens as $compareKey => $compareToken) { - if ($token->equals($compareToken)) { + // Since tokens are sorted by start, only check subsequent tokens + // that could overlap (start < token->end) + for ($j = $i + 1; $j < $count; $j++) { + if (isset($removed[$j])) { continue; } - if ($token->containsOrOverlaps($compareToken)) { - if ($token->canContain($compareToken)) { - $token->addChild($compareToken); - } + $compareToken = $tokens[$j]; + + // Since tokens are sorted by start position, + // once compareToken->start >= token->end, no more overlaps possible + if ($compareToken->start >= $token->end) { + break; + } - unset($tokens[$compareKey]); + // At this point we know: token->start <= compareToken->start < token->end + // and they are not equal (different indices, and sorted order means + // same-start tokens differ in end). This means containsOrOverlaps is true. + if ($token->canContain($compareToken)) { + $token->addChild($compareToken); } + + $removed[$j] = true; } if ($token->parent === null) { $groupedTokens[] = $token; } - - next($tokens); } return $groupedTokens;