Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rewrite php file cleaning step to be less regex intensive and support extreme cases better #10107

Merged
merged 1 commit into from Oct 2, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
72 changes: 20 additions & 52 deletions src/Composer/Autoload/ClassMapGenerator.php
Expand Up @@ -214,10 +214,7 @@ private static function filterByNamespace($classes, $filePath, $baseNamespace, $
*/
private static function findClasses($path)
{
$extraTypes = PHP_VERSION_ID < 50400 ? '' : '|trait';
if (PHP_VERSION_ID >= 80100 || (defined('HHVM_VERSION') && version_compare(HHVM_VERSION, '3.3', '>='))) {
$extraTypes .= '|enum';
}
$extraTypes = self::getExtraTypes();

// Use @ here instead of Silencer to actively suppress 'unhelpful' output
// @link https://github.com/composer/composer/pull/4886
Expand All @@ -241,57 +238,14 @@ private static function findClasses($path)
}

// return early if there is no chance of matching anything in this file
if (!preg_match('{\b(?:class|interface'.$extraTypes.')\s}i', $contents)) {
preg_match_all('{\b(?:class|interface'.$extraTypes.')\s}i', $contents, $matches);
if (!$matches) {
return array();
}

// strip heredocs/nowdocs
$heredocRegex = '{
# opening heredoc/nowdoc delimiter (word-chars)
<<<[ \t]*+([\'"]?)(\w++)\\1
# needs to be followed by a newline
(?:\r\n|\n|\r)
# the meat of it, matching line by line until end delimiter
(?:
# a valid line is optional white-space (possessive match) not followed by the end delimiter, then anything goes for the rest of the line
[\t ]*+(?!\\2 \b)[^\r\n]*+
# end of line(s)
[\r\n]++
)*
# end delimiter
[\t ]*+ \\2 (?=\b)
}x';

// run first assuming the file is valid unicode
$contentWithoutHeredoc = preg_replace($heredocRegex.'u', 'null', $contents);
if (null === $contentWithoutHeredoc) {
// run again without unicode support if the file failed to be parsed
$contents = preg_replace($heredocRegex, 'null', $contents);
} else {
$contents = $contentWithoutHeredoc;
}
unset($contentWithoutHeredoc);

// strip strings
$contents = preg_replace('{"[^"\\\\]*+(\\\\.[^"\\\\]*+)*+"|\'[^\'\\\\]*+(\\\\.[^\'\\\\]*+)*+\'}s', 'null', $contents);
// strip leading non-php code if needed
if (strpos($contents, '<?') !== 0) {
$contents = preg_replace('{^.+?<\?}s', '<?', $contents, 1, $replacements);
if ($replacements === 0) {
return array();
}
}
// strip non-php blocks in the file
$contents = preg_replace('{\?>(?:[^<]++|<(?!\?))*+<\?}s', '?><?', $contents);
// strip trailing non-php code if needed
$pos = strrpos($contents, '?>');
if (false !== $pos && false === strpos(substr($contents, $pos), '<?')) {
$contents = substr($contents, 0, $pos);
}
// strip comments if short open tags are in the file
if (preg_match('{(<\?)(?!(php|hh))}i', $contents)) {
$contents = preg_replace('{//.* | /\*(?:[^*]++|\*(?!/))*\*/}x', '', $contents);
}
$p = new PhpFileCleaner($contents, count($matches[0]));
$contents = $p->clean();
unset($p);

preg_match_all('{
(?:
Expand Down Expand Up @@ -328,4 +282,18 @@ private static function findClasses($path)

return $classes;
}

/**
 * Returns the regex alternation suffix for the class-like keywords supported
 * beyond "class" and "interface" (e.g. "|trait|enum"), or an empty string.
 *
 * The value is computed once per process. The first call also registers the
 * complete keyword list with PhpFileCleaner::setTypeConfig().
 *
 * @return string
 */
private static function getExtraTypes()
{
    static $cachedTypes = null;

    // Already computed (and PhpFileCleaner already configured): reuse it.
    if (null !== $cachedTypes) {
        return $cachedTypes;
    }

    $cachedTypes = '';
    if (PHP_VERSION_ID >= 50400) {
        $cachedTypes = '|trait';
    }

    $enumsAvailable = PHP_VERSION_ID >= 80100
        || (defined('HHVM_VERSION') && version_compare(HHVM_VERSION, '3.3', '>='));
    if ($enumsAvailable) {
        $cachedTypes .= '|enum';
    }

    // explode() yields a leading empty element for the leading "|";
    // array_filter() drops it and array_merge() renumbers the keys.
    $keywords = array_merge(
        array('class', 'interface'),
        array_filter(explode('|', $cachedTypes))
    );
    PhpFileCleaner::setTypeConfig($keywords);

    return $cachedTypes;
}
}
228 changes: 228 additions & 0 deletions src/Composer/Autoload/PhpFileCleaner.php
@@ -0,0 +1,228 @@
<?php

namespace Composer\Autoload;

/**
 * Single-pass scanner that reduces PHP source to the parts relevant for
 * locating class-like declarations.
 *
 * Content outside of "<?" ... "?>" blocks is dropped, quoted strings and
 * heredocs/nowdocs are replaced by the literal "null", and "//" and
 * "/* ... *\/" comments are removed. "#" comments are not treated specially.
 *
 * @author Jordi Boggiano <j.boggiano@seld.be>
 * @internal
 */
class PhpFileCleaner
{
    /**
     * Keyword configuration, keyed by the first character of each keyword.
     *
     * @var array<string, array{name: string, length: int, pattern: string}>
     */
    private static $typeConfig = array();

    /**
     * Anchored pattern matching a run of characters that can be copied
     * verbatim: none of them can start a construct handled by clean().
     * The default is used when setTypeConfig() has not been called.
     *
     * @var string
     */
    private static $restPattern = '{[^?"\'</]+}A';

    /**
     * @readonly
     * @var string
     */
    private $contents;

    /**
     * @readonly
     * @var int
     */
    private $len;

    /**
     * @readonly
     * @var int
     */
    private $maxMatches;

    /** @var int */
    private $index = 0;

    /**
     * Configures the declaration keywords (e.g. "class", "interface") the
     * cleaner recognises. Any previously configured keywords are replaced.
     *
     * Keywords are indexed by their first character, so two keywords sharing
     * a first character would overwrite each other.
     *
     * @param string[] $types
     * @return void
     */
    public static function setTypeConfig($types)
    {
        // Start from a clean slate so repeated calls replace rather than merge.
        self::$typeConfig = array();

        foreach ($types as $type) {
            self::$typeConfig[$type[0]] = array(
                'name' => $type,
                'length' => \strlen($type),
                // The leading "." consumes the character before the keyword so
                // that the \b and look-behind assertions can inspect it; the
                // pattern is therefore matched starting one byte earlier.
                'pattern' => '{.\b(?<![\$:>])'.$type.'\s++[a-zA-Z_\x7f-\xff:][a-zA-Z0-9_\x7f-\xff:\-]*+}Ais',
            );
        }

        // The first characters of the keywords must interrupt the verbatim
        // run so clean() gets a chance to test for a declaration there.
        self::$restPattern = '{[^?"\'</'.implode('', array_keys(self::$typeConfig)).']+}A';
    }

    /**
     * @param string $contents   Raw file contents to clean
     * @param int    $maxMatches Number of candidate declarations the caller
     *                           expects; when exactly 1, clean() stops right
     *                           after the first declaration it finds
     */
    public function __construct($contents, $maxMatches)
    {
        $this->contents = $contents;
        $this->len = \strlen($this->contents);
        $this->maxMatches = $maxMatches;
    }

    /**
     * Produces the cleaned version of the contents.
     *
     * @return string
     */
    public function clean()
    {
        $clean = '';

        while ($this->index < $this->len) {
            $this->skipToPhp();
            $clean .= '<?';

            while ($this->index < $this->len) {
                $char = $this->contents[$this->index];

                // End of the PHP block: go back to looking for the next "<?".
                if ($char === '?' && $this->peek('>')) {
                    $clean .= '?>';
                    $this->index += 2;
                    continue 2;
                }

                if ($char === '"') {
                    $this->skipString('"');
                    $clean .= 'null';
                    continue;
                }

                if ($char === "'") {
                    $this->skipString("'");
                    $clean .= 'null';
                    continue;
                }

                if ($char === '<' && $this->peek('<') && $this->match('{<<<[ \t]*+([\'"]?)([a-zA-Z_\x80-\xff][a-zA-Z0-9_\x80-\xff]*+)\\1(?:\r\n|\n|\r)}A', $match)) {
                    $this->index += \strlen($match[0]);
                    $this->skipHeredoc($match[2]);
                    $clean .= 'null';
                    continue;
                }

                if ($char === '/') {
                    if ($this->peek('/')) {
                        $this->skipToNewline();
                        continue;
                    }
                    if ($this->peek('*')) {
                        $this->skipComment();
                        // Restart with the character following the comment.
                        // Falling through would emit a stray "/" and skip
                        // the first byte after the comment.
                        continue;
                    }
                }

                // With a single expected declaration the remainder of the
                // file is irrelevant once that declaration has been found.
                if ($this->maxMatches === 1 && isset(self::$typeConfig[$char])) {
                    $type = self::$typeConfig[$char];
                    if (
                        \substr($this->contents, $this->index, $type['length']) === $type['name']
                        && \preg_match($type['pattern'], $this->contents, $match, 0, $this->index - 1)
                    ) {
                        $clean .= $match[0];

                        return $clean;
                    }
                }

                // Copy the current character plus the longest run of
                // characters that cannot start any construct handled above.
                $this->index += 1;
                if ($this->match(self::$restPattern, $match)) {
                    $clean .= $char . $match[0];
                    $this->index += \strlen($match[0]);
                } else {
                    $clean .= $char;
                }
            }
        }

        return $clean;
    }

    /**
     * Advances just past the next "<?" or to the end of the contents.
     *
     * @return void
     */
    private function skipToPhp()
    {
        while ($this->index < $this->len) {
            if ($this->contents[$this->index] === '<' && $this->peek('?')) {
                $this->index += 2;
                break;
            }

            $this->index += 1;
        }
    }

    /**
     * Advances past a quoted string; the index must be on the opening quote.
     *
     * @param string $delimiter The quote character delimiting the string
     * @return void
     */
    private function skipString($delimiter)
    {
        $this->index += 1;
        while ($this->index < $this->len) {
            // An escaped backslash or escaped delimiter does not end the string.
            if ($this->contents[$this->index] === '\\' && ($this->peek('\\') || $this->peek($delimiter))) {
                $this->index += 2;
                continue;
            }
            if ($this->contents[$this->index] === $delimiter) {
                $this->index += 1;
                break;
            }
            $this->index += 1;
        }
    }

    /**
     * Advances past a block comment; the index must be on its opening "/".
     *
     * @return void
     */
    private function skipComment()
    {
        $this->index += 2;
        while ($this->index < $this->len) {
            if ($this->contents[$this->index] === '*' && $this->peek('/')) {
                $this->index += 2;
                break;
            }

            $this->index += 1;
        }
    }

    /**
     * Advances to the next CR or LF (without consuming it) or to the end.
     *
     * @return void
     */
    private function skipToNewline()
    {
        while ($this->index < $this->len) {
            if ($this->contents[$this->index] === "\r" || $this->contents[$this->index] === "\n") {
                return;
            }
            $this->index += 1;
        }
    }

    /**
     * Advances past a heredoc/nowdoc body including its closing delimiter.
     * The index must be on the first byte after the opening line.
     *
     * @param string $delimiter The heredoc/nowdoc identifier
     * @return void
     */
    private function skipHeredoc($delimiter)
    {
        $firstDelimiterChar = $delimiter[0];
        $delimiterLength = \strlen($delimiter);
        // The closing delimiter must not be followed by an identifier character.
        $delimiterPattern = '{'.preg_quote($delimiter).'(?![a-zA-Z0-9_\x80-\xff])}A';

        while ($this->index < $this->len) {
            $char = $this->contents[$this->index];

            // The closing delimiter may be preceded by spaces/tabs.
            if ($char === "\t" || $char === ' ') {
                $this->index += 1;
                continue;
            }

            if (
                $char === $firstDelimiterChar
                && \substr($this->contents, $this->index, $delimiterLength) === $delimiter
                && $this->match($delimiterPattern)
            ) {
                $this->index += $delimiterLength;

                return;
            }

            // Not the closing delimiter: skip the rest of the line...
            $this->skipToNewline();

            // ...and the newline character(s) that follow it.
            while ($this->index < $this->len && ($this->contents[$this->index] === "\r" || $this->contents[$this->index] === "\n")) {
                $this->index += 1;
            }
        }
    }

    /**
     * Tells whether the byte following the current index equals $char.
     *
     * @param string $char
     * @return bool
     */
    private function peek($char)
    {
        return $this->index + 1 < $this->len && $this->contents[$this->index + 1] === $char;
    }

    /**
     * Runs $regex against the contents starting at the current index.
     *
     * The by-reference parameter carries no "array" type hint: combined with
     * the null default that would be an implicitly nullable type.
     *
     * @param string     $regex
     * @param array|null $match Receives the preg_match() groups
     * @return bool
     */
    private function match($regex, &$match = null)
    {
        if (\preg_match($regex, $this->contents, $match, 0, $this->index)) {
            return true;
        }

        return false;
    }
}
7 changes: 7 additions & 0 deletions tests/Composer/Test/Autoload/Fixtures/classmap/NonUnicode.php
@@ -0,0 +1,7 @@
<?php

echo <<<'NOT¶ING_TO_SEE_H¤RE'
class FailHeredocNonUnicodeNonAscii
{
}
NOT¶ING_TO_SEE_H¤RE;
2 changes: 2 additions & 0 deletions tests/Composer/Test/Autoload/Fixtures/classmap/StripNoise.php
Expand Up @@ -19,6 +19,8 @@ class FailHeredocWhitespace
}
WHITESPACE . <<< MARKERINTEXT
In PHP < 7.3, the docblock marker could occur in the text as long as it did not occur at the very start of the line.
MARKERINTEXTwithtrail
MARKERINTEXT_
class FailHeredocMarkerInText
{
}
Expand Down