Skip to content

Commit

Permalink
Merge pull request #82 from stecman/string-arguments
Browse files Browse the repository at this point in the history
Rewrite command-line splitting to tokenize quoted strings
  • Loading branch information
stecman committed Apr 26, 2019
2 parents bd07a24 + cbec6e8 commit bc095fe
Show file tree
Hide file tree
Showing 6 changed files with 374 additions and 51 deletions.
49 changes: 48 additions & 1 deletion src/CompletionCommand.php
Expand Up @@ -121,7 +121,54 @@ protected function execute(InputInterface $input, OutputInterface $output)
$output->write($hook, true);
} else {
$handler->setContext(new EnvironmentCompletionContext());
$output->write($this->runCompletion(), true);

// Get completion results
$results = $this->runCompletion();

// Escape results for the current shell
$shellType = $input->getOption('shell-type') ?: $this->getShellType();

foreach ($results as &$result) {
$result = $this->escapeForShell($result, $shellType);
}

$output->write($results, true);
}
}

/**
 * Prepare a single completion result for output to the specified shell
 *
 * Only BASH results are transformed; a result for any other shell type is returned unchanged.
 *
 * @param string $result - A completion result that should appear in the shell
 * @param string $shellType - Valid shell type from HookFactory
 * @return string
 */
protected function escapeForShell($result, $shellType)
{
    // No transformation unless we are producing output for BASH
    if ($shellType != 'bash') {
        return $result;
    }

    // BASH requires special escaping for multi-word and special character results.
    // This emulates registering completion with `-o filenames`, without side-effects like dir name slashes.
    $rawWord = $this->handler->getContext()->getRawCurrentWord();
    $openingChar = substr($rawWord, 0, 1);

    if ($openingChar == "'") {
        // The word being completed is single-quoted: escape single quotes in the result
        $escaped = str_replace("'", "\\'", $result);
    } elseif ($openingChar == '"') {
        // The word being completed is double-quoted: escape double quotes in the result
        $escaped = str_replace('"', '\\"', $result);
    } else {
        // Otherwise assume the word is unquoted: escape whitespace, quotes and backslashes
        $escaped = preg_replace('/([\s\'"\\\\])/', '\\\\$1', $result);
    }

    // Escape output to prevent special characters being lost when passing results to compgen
    return escapeshellarg($escaped);
}

Expand Down
212 changes: 173 additions & 39 deletions src/CompletionContext.php
Expand Up @@ -32,17 +32,27 @@ class CompletionContext
protected $charIndex = 0;

/**
* An array containing the individual words in the current command line.
* An array of the individual words in the current command line.
*
* This is not set until $this->splitCommand() is called, when it is populated by
* $commandLine exploded by $wordBreaks
*
* Bash equivalent: COMP_WORDS
*
* @var array|null
* @var string[]|null
*/
protected $words = null;

/**
* Words from the current command line before quotes and escaping are processed
*
* This is indexed the same as $this->words, but each word is kept in its raw input form, including
* quotes and escaping.
*
* @var string[]|null
*/
protected $rawWords = null;

/**
* The index in $this->words containing the word at the current cursor position.
*
Expand All @@ -61,7 +71,7 @@ class CompletionContext
*
* @var string
*/
protected $wordBreaks = "'\"()= \t\n";
protected $wordBreaks = "= \t\n";

/**
* Set the whole contents of the command line as a string
Expand Down Expand Up @@ -101,6 +111,22 @@ public function getCurrentWord()
return '';
}

/**
 * Get the word under the cursor exactly as it appears in the input command line
 *
 * Any quotes and escaping present in the input are left intact.
 *
 * @return string - the raw word, or an empty string if there is no raw word at the current word index
 */
public function getRawCurrentWord()
{
    return isset($this->rawWords[$this->wordIndex])
        ? $this->rawWords[$this->wordIndex]
        : '';
}

/**
* Return a word by index from the command line
*
Expand Down Expand Up @@ -132,6 +158,22 @@ public function getWords()
return $this->words;
}

/**
 * Return the command line's words in their literal, unprocessed form
 *
 * The returned array uses the same indexes as getWords(), but each word keeps the quoting and
 * escaping it had on the command line. The command line is split on first use.
 *
 * @return string[]
 */
public function getRawWords()
{
    // Raw words are computed lazily: split the command line if that has not happened yet
    if (is_null($this->rawWords)) {
        $this->splitCommand();
    }

    return $this->rawWords;
}

/**
* Get the index of the word the cursor is currently in
*
Expand Down Expand Up @@ -178,12 +220,15 @@ public function setCharIndex($index)
* This defaults to a sane value based on BASH's word break characters and shouldn't
* need to be changed unless your completions contain the default word break characters.
*
* @deprecated This is becoming an internal setting that doesn't make sense to expose publicly.
*
* @see wordBreaks
* @param string $charList - a single string containing all of the characters to break words on
*/
public function setWordBreaks($charList)
{
    // Drop quotes from break characters - strings are handled separately to word breaks now.
    // (The raw $charList is intentionally never stored: it would be overwritten immediately.)
    $this->wordBreaks = str_replace(array('"', '\''), '', $charList);

    // Discard previously computed words so the command line is split again on next access
    $this->reset();
}

Expand All @@ -194,57 +239,146 @@ public function setWordBreaks($charList)
*/
protected function splitCommand()
{
$this->words = array();
$this->wordIndex = null;
$cursor = 0;
$tokens = $this->tokenizeString($this->commandLine);

$breaks = preg_quote($this->wordBreaks);

if (!preg_match_all("/([^$breaks]*)([$breaks]*)/", $this->commandLine, $matches)) {
return;
}

// Groups:
// 1: Word
// 2: Break characters
foreach ($matches[0] as $index => $wholeMatch) {
// Determine which word the cursor is in
$cursor += strlen($wholeMatch);
$word = $matches[1][$index];
$breaks = $matches[2][$index];

if ($this->wordIndex === null && $cursor >= $this->charIndex) {
$this->wordIndex = $index;

// Find the user's cursor position relative to the end of this word
// The end of the word is the internal cursor minus any break characters that were captured
$cursorWordOffset = $this->charIndex - ($cursor - strlen($breaks));
foreach ($tokens as $token) {
if ($token['type'] != 'break') {
$this->words[] = $this->getTokenValue($token);
$this->rawWords[] = $token['value'];
}

if ($cursorWordOffset < 0) {
// Cursor is inside the word - truncate the word at the cursor
// (This emulates normal BASH completion behaviour I've observed, though I'm not entirely sure if it's useful)
$word = substr($word, 0, strlen($word) + $cursorWordOffset);
// Determine which word index the cursor is inside once we reach its offset
if ($this->wordIndex === null && $this->charIndex <= $token['offsetEnd']) {
$this->wordIndex = count($this->words) - 1;

} elseif ($cursorWordOffset > 0) {
if ($token['type'] == 'break') {
// Cursor is in the break-space after a word
// Push an empty word at the cursor to allow completion of new terms at the cursor, ignoring words ahead
$this->wordIndex++;
$this->words[] = $word;
$this->words[] = '';
$this->rawWords[] = '';
continue;
}
}

if ($word !== '') {
$this->words[] = $word;
if ($this->charIndex < $token['offsetEnd']) {
// Cursor is inside the current word - truncate the word at the cursor to complete on
// This emulates BASH completion's behaviour with COMP_CWORD

// Create a copy of the token with its value truncated
$truncatedToken = $token;
$relativeOffset = $this->charIndex - $token['offset'];
$truncatedToken['value'] = substr($token['value'], 0, $relativeOffset);

// Replace the current word with the truncated value
$this->words[$this->wordIndex] = $this->getTokenValue($truncatedToken);
$this->rawWords[$this->wordIndex] = $truncatedToken['value'];
}
}
}

if ($this->wordIndex > count($this->words) - 1) {
$this->wordIndex = count($this->words) - 1;
// Cursor position is past the end of the command line string - consider it a new word
if ($this->wordIndex === null) {
$this->wordIndex = count($this->words);
$this->words[] = '';
$this->rawWords[] = '';
}
}

/**
 * Return a token's value with escaping and quotes removed
 *
 * For 'quoted' tokens the opening quote is always removed. The closing quote is removed only
 * when it is the same character as the opening quote and is not itself escaped, so an unclosed
 * string such as `"it'` yields `it'` and `"foo\"` yields `foo"`. Afterwards every backslash
 * escape (`\x`) is replaced by the escaped character.
 *
 * @see self::tokenizeString()
 * @param array $token - a token array; only its 'type' and 'value' keys are read
 * @return string
 */
protected function getTokenValue($token)
{
    $value = $token['value'];

    // Remove outer quote characters (or only the first quote if the string is unclosed)
    if ($token['type'] == 'quoted' && is_string($value) && $value !== '') {
        $quote = $value[0];

        if ($quote === '"' || $quote === "'") {
            // Cast guards against substr() returning false for a one-character string on older PHP
            $value = (string) substr($value, 1);

            // The body is a run of escape pairs and non-backslash characters, so a trailing quote
            // only counts as the closing quote when it is not consumed as part of an escape pair.
            // The "s" flag lets the body span newlines inside the quoted string.
            $closedPattern = '/^((?:\\\\.|[^\\\\])*)' . $quote . '\z/s';

            if (preg_match($closedPattern, $value, $parts)) {
                $value = $parts[1];
            }
        }
    }

    // Remove escape characters
    $value = preg_replace('/\\\\(.)/', '$1', $value);

    return $value;
}

/**
 * Break a string into words, quoted strings and non-words (breaks)
 *
 * Returns an array of unmodified segments of $string with offset and type information.
 * Each token's type is one of 'quoted', 'word' or 'break'. Offsets are those reported by
 * preg_match_all() with PREG_OFFSET_CAPTURE, and 'offsetEnd' is 'offset' plus strlen() of the value.
 *
 * @param string $string
 * @return array as [ [type => string, value => string, offset => int, offsetEnd => int], ... ]
 */
protected function tokenizeString($string)
{
    // Map capture groups to returned token type
    $typeMap = array(
        'double_quote_string' => 'quoted',
        'single_quote_string' => 'quoted',
        'word' => 'word',
        'break' => 'break',
    );

    // Escape every word break character including whitespace
    // preg_quote won't work here as it doesn't understand the ignore whitespace flag ("x")
    $breaks = preg_replace('/(.)/', '\\\$1', $this->wordBreaks);

    // Quoted strings run to their closing quote or, if unclosed, to the end of the input.
    // The closing heredoc marker stays in the first column so the code also parses before PHP 7.3.
    $pattern = <<<"REGEX"
/(?:
(?P<double_quote_string>
"(\\\\.|[^\"\\\\])*(?:"|$)
) |
(?P<single_quote_string>
'(\\\\.|[^'\\\\])*(?:'|$)
) |
(?P<word>
(?:\\\\.|[^$breaks])+
) |
(?P<break>
[$breaks]+
)
)/x
REGEX;

    $tokens = array();

    if (!preg_match_all($pattern, $string, $matches, PREG_OFFSET_CAPTURE | PREG_SET_ORDER)) {
        return $tokens;
    }

    foreach ($matches as $set) {
        foreach ($set as $groupName => $match) {
            // Ignore integer indices preg_match outputs (duplicates of named groups)
            if (is_int($groupName)) {
                continue;
            }

            // Skip if the offset indicates this group didn't match
            if ($match[1] === -1) {
                continue;
            }

            $tokens[] = array(
                'type' => $typeMap[$groupName],
                'value' => $match[0],
                'offset' => $match[1],
                'offsetEnd' => $match[1] + strlen($match[0])
            );

            // Only one named group can match per set, so stop scanning this set and
            // move on to the next one
            break;
        }
    }

    return $tokens;
}

/**
* Reset the computed words so that $this->splitCommand() is forced to run again
*/
Expand Down

0 comments on commit bc095fe

Please sign in to comment.