Skip to content

Commit

Permalink
Rewrite php file cleaning step to be less regex intensive and support…
Browse files Browse the repository at this point in the history
… extreme cases better, fixes #10106
  • Loading branch information
Seldaek committed Sep 15, 2021
1 parent ffee8ca commit f813089
Show file tree
Hide file tree
Showing 6 changed files with 25,583 additions and 52 deletions.
72 changes: 20 additions & 52 deletions src/Composer/Autoload/ClassMapGenerator.php
Expand Up @@ -214,10 +214,7 @@ private static function filterByNamespace($classes, $filePath, $baseNamespace, $
*/
private static function findClasses($path)
{
$extraTypes = PHP_VERSION_ID < 50400 ? '' : '|trait';
if (PHP_VERSION_ID >= 80100 || (defined('HHVM_VERSION') && version_compare(HHVM_VERSION, '3.3', '>='))) {
$extraTypes .= '|enum';
}
$extraTypes = self::getExtraTypes();

// Use @ here instead of Silencer to actively suppress 'unhelpful' output
// @link https://github.com/composer/composer/pull/4886
Expand All @@ -241,57 +238,14 @@ private static function findClasses($path)
}

// return early if there is no chance of matching anything in this file
if (!preg_match('{\b(?:class|interface'.$extraTypes.')\s}i', $contents)) {
preg_match_all('{\b(?:class|interface'.$extraTypes.')\s}i', $contents, $matches);
if (!$matches) {
return array();
}

// strip heredocs/nowdocs
$heredocRegex = '{
# opening heredoc/nowdoc delimiter (word-chars)
<<<[ \t]*+([\'"]?)(\w++)\\1
# needs to be followed by a newline
(?:\r\n|\n|\r)
# the meat of it, matching line by line until end delimiter
(?:
# a valid line is optional white-space (possessive match) not followed by the end delimiter, then anything goes for the rest of the line
[\t ]*+(?!\\2 \b)[^\r\n]*+
# end of line(s)
[\r\n]++
)*
# end delimiter
[\t ]*+ \\2 (?=\b)
}x';

// run first assuming the file is valid unicode
$contentWithoutHeredoc = preg_replace($heredocRegex.'u', 'null', $contents);
if (null === $contentWithoutHeredoc) {
// run again without unicode support if the file failed to be parsed
$contents = preg_replace($heredocRegex, 'null', $contents);
} else {
$contents = $contentWithoutHeredoc;
}
unset($contentWithoutHeredoc);

// strip strings
$contents = preg_replace('{"[^"\\\\]*+(\\\\.[^"\\\\]*+)*+"|\'[^\'\\\\]*+(\\\\.[^\'\\\\]*+)*+\'}s', 'null', $contents);
// strip leading non-php code if needed
if (strpos($contents, '<?') !== 0) {
$contents = preg_replace('{^.+?<\?}s', '<?', $contents, 1, $replacements);
if ($replacements === 0) {
return array();
}
}
// strip non-php blocks in the file
$contents = preg_replace('{\?>(?:[^<]++|<(?!\?))*+<\?}s', '?><?', $contents);
// strip trailing non-php code if needed
$pos = strrpos($contents, '?>');
if (false !== $pos && false === strpos(substr($contents, $pos), '<?')) {
$contents = substr($contents, 0, $pos);
}
// strip comments if short open tags are in the file
if (preg_match('{(<\?)(?!(php|hh))}i', $contents)) {
$contents = preg_replace('{//.* | /\*(?:[^*]++|\*(?!/))*\*/}x', '', $contents);
}
$p = new PhpFileCleaner($contents, count($matches[0]));
$contents = $p->clean();
unset($p);

preg_match_all('{
(?:
Expand Down Expand Up @@ -328,4 +282,18 @@ private static function findClasses($path)

return $classes;
}

/**
 * Returns the regex alternation suffix for the type keywords that exist
 * besides "class" and "interface" on the running PHP/HHVM version.
 *
 * The value is computed once per process and cached in a static local.
 * The first call also configures PhpFileCleaner with the complete list
 * of type keywords ("class", "interface" plus the extra ones).
 *
 * @return string either an empty string or a string such as "|trait" or "|trait|enum"
 */
private static function getExtraTypes()
{
    static $extraTypes = null;

    // already computed (and PhpFileCleaner already configured) by an earlier call
    if (null !== $extraTypes) {
        return $extraTypes;
    }

    $enumAvailable = PHP_VERSION_ID >= 80100
        || (defined('HHVM_VERSION') && version_compare(HHVM_VERSION, '3.3', '>='));

    $extraTypes = PHP_VERSION_ID >= 50400 ? '|trait' : '';
    if ($enumAvailable) {
        $extraTypes .= '|enum';
    }

    // explode() yields an empty leading element, array_filter() drops it
    $extraKeywords = array_filter(explode('|', $extraTypes));
    $allKeywords = array_merge(array('class', 'interface'), $extraKeywords);
    PhpFileCleaner::setTypeConfig($allKeywords);

    return $extraTypes;
}
}
228 changes: 228 additions & 0 deletions src/Composer/Autoload/PhpFileCleaner.php
@@ -0,0 +1,228 @@
<?php

namespace Composer\Autoload;

/**
* @author Jordi Boggiano <j.boggiano@seld.be>
* @internal
*/
/**
 * Single-pass scanner that reduces a PHP file to the parts relevant for
 * locating type declarations.
 *
 * Text outside of <? ... ?> blocks is dropped, string literals and
 * heredocs/nowdocs are replaced by the token "null", and // and block
 * comments are removed. setTypeConfig() must be called before clean(), as
 * clean() relies on the patterns it builds.
 */
class PhpFileCleaner
{
    /** @var array<array{name: string, length: int, pattern: string}> keyed by the first character of the type keyword */
    private static $typeConfig;
    /** @var string anchored pattern matching a run of characters that need no special handling */
    private static $restPattern;

    /**
     * @readonly
     * @var string
     */
    private $contents;

    /**
     * @readonly
     * @var int
     */
    private $len;

    /**
     * @readonly
     * @var int
     */
    private $maxMatches;

    /** @var int current byte offset into $contents */
    private $index = 0;

    /**
     * Registers the type keywords (e.g. "class", "interface") the cleaner looks for.
     *
     * Entries are indexed by the first character of each keyword, so two
     * keywords sharing a first character would overwrite each other. Entries
     * from earlier calls are kept.
     *
     * @param string[] $types
     */
    public static function setTypeConfig($types)
    {
        foreach ($types as $type) {
            self::$typeConfig[$type[0]] = array(
                'name' => $type,
                'length' => \strlen($type),
                'pattern' => '{.\b(?<![\$:>])'.$type.'\s++[a-zA-Z_\x7f-\xff:][a-zA-Z0-9_\x7f-\xff:\-]*+}Ais',
            );
        }

        // everything except the characters that can start a special construct
        // (?>, strings, heredocs/open tags, comments) or a type keyword
        self::$restPattern = '{[^?"\'</'.implode('', array_keys(self::$typeConfig)).']+}A';
    }

    /**
     * @param string $contents   raw file contents
     * @param int    $maxMatches number of type keyword occurrences the caller found in $contents;
     *                           when it is exactly 1, clean() stops at the first declaration
     */
    public function __construct($contents, $maxMatches)
    {
        $this->contents = $contents;
        $this->len = \strlen($this->contents);
        $this->maxMatches = $maxMatches;
    }

    /**
     * Produces the cleaned-up version of the contents.
     *
     * When $maxMatches is 1 the method returns as soon as the first type
     * declaration has been appended, since nothing after it can matter.
     *
     * @return string
     */
    public function clean()
    {
        $clean = '';

        while ($this->index < $this->len) {
            $this->skipToPhp();
            $clean .= '<?';

            while ($this->index < $this->len) {
                $char = $this->contents[$this->index];
                if ($char === '?' && $this->peek('>')) {
                    // end of the PHP block, go look for the next open tag
                    $clean .= '?>';
                    $this->index += 2;
                    continue 2;
                }

                if ($char === '"') {
                    $this->skipString('"');
                    $clean .= 'null';
                    continue;
                }

                if ($char === "'") {
                    $this->skipString("'");
                    $clean .= 'null';
                    continue;
                }

                if ($char === "<" && $this->peek('<') && $this->match('{<<<[ \t]*+([\'"]?)([a-zA-Z_\x80-\xff][a-zA-Z0-9_\x80-\xff]*+)\\1(?:\r\n|\n|\r)}A', $match)) {
                    $this->index += \strlen($match[0]);
                    $this->skipHeredoc($match[2]);
                    $clean .= 'null';
                    continue;
                }

                if ($char === '/') {
                    if ($this->peek('/')) {
                        // the newline itself is left in place and copied on the next iteration
                        $this->skipToNewline();
                        continue;
                    }
                    if ($this->peek('*')) {
                        $this->skipComment();
                        // restart with the character following the comment; falling through
                        // would emit the stale "/" and swallow that character
                        continue;
                    }
                }

                if ($this->maxMatches === 1 && isset(self::$typeConfig[$char])) {
                    $type = self::$typeConfig[$char];
                    if (
                        \substr($this->contents, $this->index, $type['length']) === $type['name']
                        // the pattern starts with "." so it is applied one byte earlier,
                        // which lets it inspect the character preceding the keyword
                        && \preg_match($type['pattern'], $this->contents, $match, 0, $this->index - 1)
                    ) {
                        $clean .= $match[0];

                        return $clean;
                    }
                }

                // copy the current character plus the following run of unremarkable characters
                $this->index += 1;
                if ($this->match(self::$restPattern, $match)) {
                    $clean .= $char . $match[0];
                    $this->index += \strlen($match[0]);
                } else {
                    $clean .= $char;
                }
            }
        }

        return $clean;
    }

    /**
     * Advances past the next "<?" sequence, or to the end of the contents if there is none.
     */
    private function skipToPhp()
    {
        while ($this->index < $this->len) {
            if ($this->contents[$this->index] === '<' && $this->peek('?')) {
                $this->index += 2;
                break;
            }

            $this->index += 1;
        }
    }

    /**
     * Advances past a quoted string; the index must point at the opening delimiter.
     *
     * Stops after the closing delimiter, or at the end of the contents if the
     * string is unterminated.
     *
     * @param string $delimiter the quote character enclosing the string
     */
    private function skipString($delimiter)
    {
        $this->index += 1;
        while ($this->index < $this->len) {
            // an escaped backslash or escaped delimiter never terminates the string
            if ($this->contents[$this->index] === '\\' && ($this->peek('\\') || $this->peek($delimiter))) {
                $this->index += 2;
                continue;
            }
            if ($this->contents[$this->index] === $delimiter) {
                $this->index += 1;
                break;
            }
            $this->index += 1;
        }
    }

    /**
     * Advances past a block comment; the index must point at its opening "/".
     *
     * Stops after the closing sequence, or at the end of the contents if the
     * comment is unterminated.
     */
    private function skipComment()
    {
        $this->index += 2;
        while ($this->index < $this->len) {
            if ($this->contents[$this->index] === '*' && $this->peek('/')) {
                $this->index += 2;
                break;
            }

            $this->index += 1;
        }
    }

    /**
     * Advances to the next CR or LF without consuming it, or to the end of the contents.
     */
    private function skipToNewline()
    {
        while ($this->index < $this->len) {
            if ($this->contents[$this->index] === "\r" || $this->contents[$this->index] === "\n") {
                return;
            }
            $this->index += 1;
        }
    }

    /**
     * Advances past a heredoc/nowdoc body including its closing delimiter.
     *
     * The index must point at the first byte after the newline that ends the
     * opening line. The closing delimiter is accepted at the start of a line,
     * optionally preceded by spaces/tabs, provided it is not directly followed
     * by another identifier character.
     *
     * @param string $delimiter the heredoc/nowdoc identifier
     */
    private function skipHeredoc($delimiter)
    {
        $firstDelimiterChar = $delimiter[0];
        $delimiterLength = \strlen($delimiter);
        $delimiterPattern = '{'.preg_quote($delimiter).'(?![a-zA-Z0-9_\x80-\xff])}A';

        while ($this->index < $this->len) {
            // check if we find the delimiter after some spaces/tabs
            switch ($this->contents[$this->index]) {
                case "\t":
                case " ":
                    $this->index += 1;
                    continue 2;
                case $firstDelimiterChar:
                    if (
                        \substr($this->contents, $this->index, $delimiterLength) === $delimiter
                        && $this->match($delimiterPattern)
                    ) {
                        $this->index += $delimiterLength;

                        return;
                    }
                    break;
            }

            // not the closing delimiter: skip the rest of the line ...
            $this->skipToNewline();

            // ... and the newline characters that follow it
            while ($this->index < $this->len && ($this->contents[$this->index] === "\r" || $this->contents[$this->index] === "\n")) {
                $this->index += 1;
            }
        }
    }

    /**
     * Tells whether the byte following the current index equals $char.
     *
     * @param  string $char
     * @return bool
     */
    private function peek($char)
    {
        return $this->index + 1 < $this->len && $this->contents[$this->index + 1] === $char;
    }

    /**
     * Runs $regex against the contents starting at the current index.
     *
     * @param  string     $regex
     * @param  array|null $match receives the preg_match() groups
     * @return bool
     */
    private function match($regex, array &$match = null)
    {
        if (\preg_match($regex, $this->contents, $match, 0, $this->index)) {
            return true;
        }

        return false;
    }
}
1 change: 1 addition & 0 deletions tests/Composer/Test/Autoload/ClassMapGeneratorTest.php
Expand Up @@ -252,6 +252,7 @@ public function testDump()
public function testCreateMapDoesNotHitRegexBacktraceLimit()
{
$expected = array(
'Faker\\Provider\\nl_BE\\Text'=> realpath(__DIR__) . '/Fixtures/pcrebacktracelimit/VeryLongNowdoc2.php',
'Foo\\StripNoise' => realpath(__DIR__) . '/Fixtures/pcrebacktracelimit/StripNoise.php',
'Foo\\VeryLongHeredoc' => realpath(__DIR__) . '/Fixtures/pcrebacktracelimit/VeryLongHeredoc.php',
'Foo\\ClassAfterLongHereDoc' => realpath(__DIR__) . '/Fixtures/pcrebacktracelimit/VeryLongHeredoc.php',
Expand Down
7 changes: 7 additions & 0 deletions tests/Composer/Test/Autoload/Fixtures/classmap/NonUnicode.php
@@ -0,0 +1,7 @@
<?php

echo <<<'NOT¶ING_TO_SEE_H¤RE'
class FailHeredocNonUnicodeNonAscii
{
}
NOT¶ING_TO_SEE_H¤RE;
2 changes: 2 additions & 0 deletions tests/Composer/Test/Autoload/Fixtures/classmap/StripNoise.php
Expand Up @@ -19,6 +19,8 @@ class FailHeredocWhitespace
}
WHITESPACE . <<< MARKERINTEXT
In PHP < 7.3, the docblock marker could occur in the text as long as it did not occur at the very start of the line.
MARKERINTEXTwithtrail
MARKERINTEXT_
class FailHeredocMarkerInText
{
}
Expand Down

0 comments on commit f813089

Please sign in to comment.