Merge pull request #82 from stecman/string-arguments

Rewrite command-line splitting to tokenize quoted strings
stecman · Apr 26, 2019 · bc095fe · bc095fe
2 parents bd07a24 + cbec6e8
commit bc095fe
Show file tree

Hide file tree

Showing 6 changed files with 374 additions and 51 deletions.
diff --git a/src/CompletionCommand.php b/src/CompletionCommand.php
@@ -121,7 +121,54 @@ protected function execute(InputInterface $input, OutputInterface $output)
             $output->write($hook, true);
         } else {
             $handler->setContext(new EnvironmentCompletionContext());
-            $output->write($this->runCompletion(), true);
+
+            // Get completion results
+            $results = $this->runCompletion();
+
+            // Escape results for the current shell
+            $shellType = $input->getOption('shell-type') ?: $this->getShellType();
+
+            foreach ($results as &$result) {
+                $result = $this->escapeForShell($result, $shellType);
+            }
+
+            $output->write($results, true);
+        }
+    }
+
+    /**
+     * Prepare a single completion result for output to the given shell
+     *
+     * Only BASH results are transformed; for any other shell type the result is returned untouched.
+     *
+     * @param string $result - Completion result that should appear in the shell
+     * @param string $shellType - Valid shell type from HookFactory
+     * @return string
+     */
+    protected function escapeForShell($result, $shellType)
+    {
+        // No transformation unless the target shell is BASH
+        if ($shellType != 'bash') {
+            return $result;
+        }
+
+        // BASH requires special escaping for multi-word and special character results
+        // This emulates registering completion with `-o filenames`, without side-effects like dir name slashes
+        $rawWord = $this->handler->getContext()->getRawCurrentWord();
+
+        // Every branch finishes with escapeshellarg() to prevent special characters being lost
+        // when the results are passed to compgen
+        switch (substr($rawWord, 0, 1)) {
+            case "'":
+                // The current word is single-quoted: escape any single quotes in the result
+                return escapeshellarg(str_replace("'", "\\'", $result));
+
+            case '"':
+                // The current word is double-quoted: escape any double quotes in the result
+                return escapeshellarg(str_replace('"', '\\"', $result));
+
+            default:
+                // Otherwise assume the string is unquoted and word breaks should be escaped
+                return escapeshellarg(preg_replace('/([\s\'"\\\\])/', '\\\\$1', $result));
         }
     }
 

diff --git a/src/CompletionContext.php b/src/CompletionContext.php
@@ -32,17 +32,27 @@ class CompletionContext
     protected $charIndex = 0;
 
     /**
-     * An array containing the individual words in the current command line.
+     * An array of the individual words in the current command line.
      *
      * This is not set until $this->splitCommand() is called, when it is populated by
      * $commandLine exploded by $wordBreaks
      *
      * Bash equivalent: COMP_WORDS
      *
-     * @var array|null
+     * @var string[]|null
      */
     protected $words = null;
 
+    /**
+     * Words from the current command line before quotes and escaping are processed
+     *
+     * This is indexed the same as $this->words, but the terms are kept in their raw input form, including
+     * quotes and escaping.
+     *
+     * @var string[]|null
+     */
+    protected $rawWords = null;
+
     /**
      * The index in $this->words containing the word at the current cursor position.
      *
@@ -61,7 +71,7 @@ class CompletionContext
      *
      * @var string
      */
-    protected $wordBreaks = "'\"()= \t\n";
+    protected $wordBreaks = "= \t\n";
 
     /**
      * Set the whole contents of the command line as a string
@@ -101,6 +111,22 @@ public function getCurrentWord()
         return '';
     }
 
+    /**
+     * Return the unprocessed string for the word under the cursor
+     *
+     * This preserves any quotes and escaping that are present in the input command line.
+     * The command line is split on demand (via getRawWords()), so the result does not depend on
+     * another accessor having been called first.
+     *
+     * @return string The raw word at the cursor, or an empty string if there is no word at that index
+     */
+    public function getRawCurrentWord()
+    {
+        // Going through getRawWords() makes sure the raw words and the word index have been computed
+        $rawWords = $this->getRawWords();
+
+        if (isset($rawWords[$this->wordIndex])) {
+            return $rawWords[$this->wordIndex];
+        }
+
+        return '';
+    }
+
     /**
      * Return a word by index from the command line
      *
@@ -132,6 +158,22 @@ public function getWords()
         return $this->words;
     }
 
+    /**
+     * Return the words of the command line exactly as they appear in the input
+     *
+     * Entries share their indexes with getWords(), but keep any quoting and escaping from the command line.
+     * The command line is split on first use if that has not happened yet.
+     *
+     * @return string[]
+     */
+    public function getRawWords()
+    {
+        if (null === $this->rawWords) {
+            $this->splitCommand();
+        }
+
+        return $this->rawWords;
+    }
+
     /**
      * Get the index of the word the cursor is currently in
      *
@@ -178,12 +220,15 @@ public function setCharIndex($index)
      * This defaults to a sane value based on BASH's word break characters and shouldn't
      * need to be changed unless your completions contain the default word break characters.
      *
+     * @deprecated This is becoming an internal setting that doesn't make sense to expose publicly.
+     *
      * @see wordBreaks
      * @param string $charList - a single string containing all of the characters to break words on
      */
     public function setWordBreaks($charList)
     {
-        $this->wordBreaks = $charList;
+        // Drop quotes from break characters - strings are handled separately to word breaks now
+        $this->wordBreaks = str_replace(array('"', '\''), '', $charList);
         $this->reset();
     }
 
@@ -194,57 +239,146 @@ public function setWordBreaks($charList)
      */
     protected function splitCommand()
     {
+        // Start from a clean slate: the count() calls below need real arrays (not null) when the
+        // command line yields no tokens or begins with a break, and a repeated call must not append
+        // to the words of a previous run
         $this->words = array();
+        $this->rawWords = array();
         $this->wordIndex = null;
-        $cursor = 0;
+
+        $tokens = $this->tokenizeString($this->commandLine);
 
-        $breaks = preg_quote($this->wordBreaks);
-
-        if (!preg_match_all("/([^$breaks]*)([$breaks]*)/", $this->commandLine, $matches)) {
-            return;
-        }
-
-        // Groups:
-        // 1: Word
-        // 2: Break characters
-        foreach ($matches[0] as $index => $wholeMatch) {
-            // Determine which word the cursor is in
-            $cursor += strlen($wholeMatch);
-            $word = $matches[1][$index];
-            $breaks = $matches[2][$index];
-
-            if ($this->wordIndex === null && $cursor >= $this->charIndex) {
-                $this->wordIndex = $index;
-
-                // Find the user's cursor position relative to the end of this word
-                // The end of the word is the internal cursor minus any break characters that were captured
-                $cursorWordOffset = $this->charIndex - ($cursor - strlen($breaks));
+        foreach ($tokens as $token) {
+            if ($token['type'] != 'break') {
+                $this->words[] = $this->getTokenValue($token);
+                $this->rawWords[] = $token['value'];
+            }
 
-                if ($cursorWordOffset < 0) {
-                    // Cursor is inside the word - truncate the word at the cursor
-                    // (This emulates normal BASH completion behaviour I've observed, though I'm not entirely sure if it's useful)
-                    $word = substr($word, 0, strlen($word) + $cursorWordOffset);
+            // Determine which word index the cursor is inside once we reach its offset
+            if ($this->wordIndex === null && $this->charIndex <= $token['offsetEnd']) {
+                $this->wordIndex = count($this->words) - 1;
 
-                } elseif ($cursorWordOffset > 0) {
+                if ($token['type'] == 'break') {
                     // Cursor is in the break-space after a word
                     // Push an empty word at the cursor to allow completion of new terms at the cursor, ignoring words ahead
                     $this->wordIndex++;
-                    $this->words[] = $word;
                     $this->words[] = '';
+                    $this->rawWords[] = '';
                     continue;
                 }
-            }
 
-            if ($word !== '') {
-                $this->words[] = $word;
+                if ($this->charIndex < $token['offsetEnd']) {
+                    // Cursor is inside the current word - truncate the word at the cursor to complete on
+                    // This emulates BASH completion's behaviour with COMP_CWORD
+
+                    // Create a copy of the token with its value truncated
+                    $truncatedToken = $token;
+                    $relativeOffset = $this->charIndex - $token['offset'];
+                    $truncatedToken['value'] = substr($token['value'], 0, $relativeOffset);
+
+                    // Replace the current word with the truncated value
+                    $this->words[$this->wordIndex] = $this->getTokenValue($truncatedToken);
+                    $this->rawWords[$this->wordIndex] = $truncatedToken['value'];
+                }
             }
         }
 
-        if ($this->wordIndex > count($this->words) - 1) {
-            $this->wordIndex = count($this->words) - 1;
+        // Cursor position is past the end of the command line string - consider it a new word
+        if ($this->wordIndex === null) {
+            $this->wordIndex = count($this->words);
+            $this->words[] = '';
+            $this->rawWords[] = '';
         }
     }
 
+    /**
+     * Return a token's value with escaping and quotes removed
+     *
+     * For quoted tokens the opening quote is dropped, along with the closing quote when one is present.
+     * A trailing quote character only counts as the closing quote if it is the same character as the
+     * opening quote and is not itself backslash-escaped. Otherwise the string is treated as unclosed
+     * (for example a word that was truncated at the cursor) and the character is kept as content.
+     *
+     * @see self::tokenizeString()
+     * @param array $token - a token as built by tokenizeString(); only 'type' and 'value' are read
+     * @return string
+     */
+    protected function getTokenValue($token)
+    {
+        $value = $token['value'];
+
+        if ($token['type'] == 'quoted') {
+            $quote = substr($value, 0, 1);
+
+            // A quoted token truncated to nothing has no quote character to strip
+            if ($quote === '"' || $quote === "'") {
+                // Remove the opening quote (the cast guards against substr() returning false on PHP < 8)
+                $value = (string) substr($value, 1);
+
+                if (substr($value, -1) === $quote) {
+                    $inner = (string) substr($value, 0, -1);
+
+                    // An odd number of backslashes directly before the quote means the quote is escaped
+                    $escapes = strlen($inner) - strlen(rtrim($inner, '\\'));
+
+                    if ($escapes % 2 === 0) {
+                        $value = $inner;
+                    }
+                }
+            }
+        }
+
+        // Remove escape characters
+        $value = preg_replace('/\\\\(.)/', '$1', $value);
+
+        return $value;
+    }
+
+    /**
+     * Break a string into words, quoted strings and non-words (breaks)
+     *
+     * Returns an array of unmodified segments of $string with offset and type information.
+     * Offsets are the byte offsets reported by preg_match_all(); 'offsetEnd' is the offset plus the
+     * byte length of the segment.
+     *
+     * @param string $string
+     * @return array as [ [type => string, value => string, offset => int, offsetEnd => int], ... ]
+     */
+    protected function tokenizeString($string)
+    {
+        // Map capture groups to returned token type
+        $typeMap = array(
+            'double_quote_string' => 'quoted',
+            'single_quote_string' => 'quoted',
+            'word' => 'word',
+            'break' => 'break',
+        );
+
+        // Escape every word break character including whitespace
+        // preg_quote won't work here as it doesn't understand the ignore whitespace flag ("x")
+        $breaks = preg_replace('/(.)/', '\\\$1', $this->wordBreaks);
+
+        $pattern = <<<"REGEX"
+            /(?:
+                (?P<double_quote_string>
+                    "(\\\\.|[^\"\\\\])*(?:"|$)
+                ) |
+                (?P<single_quote_string>
+                    '(\\\\.|[^'\\\\])*(?:'|$)
+                ) |
+                (?P<word>
+                    (?:\\\\.|[^$breaks])+
+                ) |
+                (?P<break>
+                     [$breaks]+
+                )
+            )/x
+REGEX;
+
+        $tokens = array();
+
+        if (!preg_match_all($pattern, $string, $matches, PREG_OFFSET_CAPTURE | PREG_SET_ORDER)) {
+            return $tokens;
+        }
+
+        foreach ($matches as $set) {
+            foreach ($set as $groupName => $match) {
+
+                // Ignore integer indices preg_match outputs (duplicates of named groups)
+                if (is_int($groupName)) {
+                    continue;
+                }
+
+                // Skip if the offset indicates this group didn't match
+                if ($match[1] === -1) {
+                    continue;
+                }
+
+                $tokens[] = array(
+                    'type' => $typeMap[$groupName],
+                    'value' => $match[0],
+                    'offset' => $match[1],
+                    'offsetEnd' => $match[1] + strlen($match[0])
+                );
+
+                // Move to the next set (the named groups are alternatives, so only one matches per set)
+                break;
+            }
+        }
+
+        return $tokens;
+    }
+
     /**
      * Reset the computed words so that $this->splitCommand() is forced to run again
      */