
Refresh k8s client on 'Unauthorized' exceptions #337

Merged (2 commits) on Jun 24, 2022
43 changes: 29 additions & 14 deletions lib/fluent/plugin/filter_kubernetes_metadata.rb
@@ -121,6 +121,12 @@ def fetch_pod_metadata(namespace_name, pod_name)
rescue StandardError => e
@stats.bump(:pod_cache_api_nil_error)
log.debug "Exception '#{e}' encountered fetching pod metadata from Kubernetes API #{@apiVersion} endpoint #{@kubernetes_url}"
if e.message == "Unauthorized"
Contributor:
Is there a transport error of some kind that is thrown, where we could evaluate a response code (e.g. 401)? That would seem more consistent.

Contributor Author:
Let me explain how I chose this option. But first, a disclaimer: to be honest, I am not a Ruby dev. I mostly work on Fluent Bit, but I was asked to work on this since it's important to our customers, so I largely don't know my way around this codebase.

I had the same thought as you, though: what's the most canonical way to match this specific error?

So I ran this code and recorded what it output: https://github.com/PettitWesley/fluent-plugin-kubernetes_metadata_filter/blob/attempt_2/lib/fluent/plugin/kubernetes_metadata_watch_pods.rb#L53

And the result is this screenshot. Basically, the string representation of the exception, which is what gets printed, contains an HTTP code (but not in the message or full_message fields, which is interesting). It felt wrong to match on that full string, and "Unauthorized" was easy to match on, so I picked it. There didn't seem to be an actual field on the exception object that would give me the code, but maybe I just didn't know how to find it.

(Screenshot: exception string output, 2022-06-06)
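The behavior described above is reproducible with an exception class shaped like kubeclient's HttpError, which keeps the status and message in its own fields and overrides #to_s. FakeHttpError below is a hypothetical stand-in, not kubeclient's actual class, so the sketch runs without the gem:

```ruby
# Minimal reproduction of the observed behavior: the status code shows up in
# the printed string (#to_s) but not in #message.
class FakeHttpError < StandardError
  attr_reader :error_code, :message

  def initialize(error_code, message)
    @error_code = error_code
    @message = message # shadows Exception#message with the raw message
    super(message)
  end

  # Printing the exception goes through #to_s, which embeds the status code.
  def to_s
    "HTTP status code #{@error_code}, #{@message}"
  end
end

e = FakeHttpError.new(401, 'Unauthorized')
puts e.message # => "Unauthorized"  (no status code)
puts e         # => "HTTP status code 401, Unauthorized"
```

This explains why the code is visible when the exception is printed yet absent from the message field the plugin was matching on.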

Contributor:

I do believe using the 401 error_code to decide the correct path is the proper option here. You'll probably be able to use e.error_code instead of e.message.

Side note: this is most likely a temporary fix, as the underlying work should be done in the kubeclient library. The bigger issue is that no backport is coming to 4.9.4+; they are going straight to 5.x, which in turn means that all developers using this library will have to update to 5.x to get this feature.
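The error_code suggestion can be sketched as follows. StubHttpError is a hypothetical stand-in for Kubeclient::HttpError (assumed here to expose the HTTP status via #error_code, as the review suggests), so the example runs without the gem installed:

```ruby
# Stand-in for Kubeclient::HttpError, which carries the HTTP status in
# #error_code (the field the review suggests matching on).
class StubHttpError < StandardError
  attr_reader :error_code

  def initialize(error_code, message)
    @error_code = error_code
    super(message)
  end
end

# Matching the numeric status is sturdier than comparing the message string,
# whose wording can vary across kubeclient versions.
def refresh_client?(error)
  error.respond_to?(:error_code) && error.error_code == 401
end

begin
  raise StubHttpError.new(401, 'Unauthorized')
rescue StandardError => e
  puts refresh_client?(e) # => true: rebuild the client to refresh the token
end
```

A plain StandardError without an error_code falls through to the existing retry path, since refresh_client? guards with respond_to?.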

Contributor:

@larivierec are you confirming this fix will be part of 5.x? Is there any reason we could not consume those changes in this library in lieu of making these larger changes to this plugin?

Reply:

No, since I'm not a contributor, I cannot guarantee that the fix will be in the library. However, based on master, you'll see that this support is already on the master branch.

Look at the following issue to be sure.
ManageIQ/kubeclient#561

Contributor Author:

My understanding was that the maintainers of that repo have not given any timeline for a release. Hence, it's not something we can use to get a fix out to users ASAP, which is what AWS Kubernetes users have requested and why I am working on this.

If there is any way you can think of to reach out to them to speed up that release, or any way this project can consume that code from master, that's definitely preferable to my change.

Reply:

I don't see a problem with the PR at all. IMO, I would probably add this fix using the error_code rather than the message. It's never a bad thing to rely solely on underlying libraries.

Contributor Author:

Cool!

> it's never a bad thing to rely solely on underlying libraries.

Wait, I think you meant: it is a bad thing to rely solely on the underlying library; we should have protections for basic things like unauthorized exceptions in this codebase as well? That's what I was thinking too after I gave it some more thought. I also just updated this PR with the error_code change, and I'm testing it now.

@client = nil
# recreate client to refresh token
log.info("Re-creating Kubernetes API Client to refresh auth bearer token.")
create_client()
end
{}
end

@@ -153,12 +159,20 @@ def fetch_namespace_metadata(namespace_name)
rescue StandardError => e
@stats.bump(:namespace_cache_api_nil_error)
log.debug "Exception '#{e}' encountered fetching namespace metadata from Kubernetes API #{@apiVersion} endpoint #{@kubernetes_url}"
if e.message == "Unauthorized"
@client = nil
Contributor:

Maybe this should be in "create_client"
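The suggestion amounts to folding the reset into the helper, so no rescue site can forget it. A minimal sketch, with MetadataFilter and FakeClient as hypothetical stand-ins for the plugin class and Kubeclient::Client:

```ruby
# FakeClient stands in for Kubeclient::Client; the real helper builds one
# from @kubernetes_url, @ssl_options, and @auth_options.
FakeClient = Struct.new(:created_at)

class MetadataFilter
  attr_reader :client

  # The reviewer's point: create_client should own the reset, so every
  # caller gets a fresh, re-authenticated client from a single call.
  def create_client
    @client = nil # drop the stale client before rebuilding
    @client = FakeClient.new(Time.now)
  end

  # Unauthorized handling then collapses to one call, with no separate
  # "@client = nil" step duplicated at each rescue site.
  def handle_unauthorized
    create_client
  end
end

f = MetadataFilter.new
first = f.create_client
f.handle_unauthorized
puts f.client.equal?(first) # => false: a fresh client replaced the old one
```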

# recreate client to refresh token
log.info("Re-creating Kubernetes API Client to refresh auth bearer token.")
create_client()
end
{}
end

def initialize
super
@prev_time = Time.now
@ssl_options = {}
@auth_options = {}
end

def configure(conf)
@@ -230,7 +244,7 @@ def configure(conf)
end

if present?(@kubernetes_url)
ssl_options = {
@ssl_options = {
client_cert: present?(@client_cert) ? OpenSSL::X509::Certificate.new(File.read(@client_cert)) : nil,
client_key: present?(@client_key) ? OpenSSL::PKey::RSA.new(File.read(@client_key)) : nil,
ca_file: @ca_file,
@@ -249,24 +263,14 @@ def configure(conf)
0x80000
end
ssl_store.flags = OpenSSL::X509::V_FLAG_CRL_CHECK_ALL | flagval
ssl_options[:cert_store] = ssl_store
@ssl_options[:cert_store] = ssl_store
end

auth_options = {}

if present?(@bearer_token_file)
bearer_token = File.read(@bearer_token_file)
auth_options[:bearer_token] = bearer_token
@auth_options[:bearer_token_file] = @bearer_token_file
end

log.debug 'Creating K8S client'
@client = Kubeclient::Client.new(
@kubernetes_url,
@apiVersion,
ssl_options: ssl_options,
auth_options: auth_options,
as: :parsed_symbolized
)
create_client()

if @test_api_adapter
log.info "Extending client with test api adaper #{@test_api_adapter}"
@@ -305,6 +309,17 @@ def configure(conf)
end
end

def create_client()
log.debug 'Creating K8S client'
@client = Kubeclient::Client.new(
@kubernetes_url,
@apiVersion,
ssl_options: @ssl_options,
auth_options: @auth_options,
as: :parsed_symbolized
)
end

def get_metadata_for_record(namespace_name, pod_name, container_name, cache_key, create_time, batch_miss_cache, docker_id)
metadata = {
'docker' => { 'container_id' => "" },
50 changes: 29 additions & 21 deletions lib/fluent/plugin/kubernetes_metadata_watch_namespaces.rb
@@ -47,29 +47,37 @@ def set_up_namespace_thread
log.info('410 Gone encountered. Restarting namespace watch to reset resource versions.', e)
namespace_watcher = nil
rescue StandardError => e
@stats.bump(:namespace_watch_failures)
if Thread.current[:namespace_watch_retry_count] < @watch_retry_max_times
# Instead of raising exceptions and crashing Fluentd, swallow
# the exception and reset the watcher.
log.info(
'Exception encountered parsing namespace watch event. ' \
'The connection might have been closed. Sleeping for ' \
"#{Thread.current[:namespace_watch_retry_backoff_interval]} " \
'seconds and resetting the namespace watcher.', e
)
sleep(Thread.current[:namespace_watch_retry_backoff_interval])
Thread.current[:namespace_watch_retry_count] += 1
Thread.current[:namespace_watch_retry_backoff_interval] *= @watch_retry_exponential_backoff_base
if e.message == "Unauthorized"
Contributor:

Same question here regarding evaluating against a known code instead of a string.

@client = nil
# recreate client to refresh token
log.info("Encountered 'Unauthorized' exception in watch, recreating client to refresh token")
create_client()
namespace_watcher = nil
else
# Since retries failed for many times, log as errors instead
# of info and raise exceptions and trigger Fluentd to restart.
message =
'Exception encountered parsing namespace watch event. The ' \
'connection might have been closed. Retried ' \
"#{@watch_retry_max_times} times yet still failing. Restarting."
log.error(message, e)
raise Fluent::UnrecoverableError, message
@stats.bump(:namespace_watch_failures)
if Thread.current[:namespace_watch_retry_count] < @watch_retry_max_times
# Instead of raising exceptions and crashing Fluentd, swallow
# the exception and reset the watcher.
log.info(
'Exception encountered parsing namespace watch event. ' \
'The connection might have been closed. Sleeping for ' \
"#{Thread.current[:namespace_watch_retry_backoff_interval]} " \
'seconds and resetting the namespace watcher.', e
)
sleep(Thread.current[:namespace_watch_retry_backoff_interval])
Thread.current[:namespace_watch_retry_count] += 1
Thread.current[:namespace_watch_retry_backoff_interval] *= @watch_retry_exponential_backoff_base
namespace_watcher = nil
else
# Since retries failed for many times, log as errors instead
# of info and raise exceptions and trigger Fluentd to restart.
message =
'Exception encountered parsing namespace watch event. The ' \
'connection might have been closed. Retried ' \
"#{@watch_retry_max_times} times yet still failing. Restarting."
log.error(message, e)
raise Fluent::UnrecoverableError, message
end
end
end
end
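The retry bookkeeping in the hunk above (counter, exponential backoff, hard cap) can be sketched in isolation; backoff_intervals is an illustrative helper, not part of the plugin:

```ruby
# Isolated sketch of the watch retry arithmetic: each failure multiplies the
# sleep interval by the exponential base until the retry budget
# (@watch_retry_max_times in the plugin) is exhausted, after which the plugin
# raises Fluent::UnrecoverableError instead of sleeping again.
def backoff_intervals(initial:, base:, max_times:)
  interval = initial
  intervals = []
  max_times.times do
    intervals << interval
    interval *= base
  end
  intervals
end

# With a 1s starting interval and base 2, five retries sleep 1, 2, 4, 8, 16s.
puts backoff_intervals(initial: 1, base: 2, max_times: 5).inspect
# => [1, 2, 4, 8, 16]
```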
50 changes: 29 additions & 21 deletions lib/fluent/plugin/kubernetes_metadata_watch_pods.rb
@@ -48,29 +48,37 @@ def set_up_pod_thread
log.info('410 Gone encountered. Restarting pod watch to reset resource versions.', e)
pod_watcher = nil
rescue StandardError => e
@stats.bump(:pod_watch_failures)
if Thread.current[:pod_watch_retry_count] < @watch_retry_max_times
# Instead of raising exceptions and crashing Fluentd, swallow
# the exception and reset the watcher.
log.info(
'Exception encountered parsing pod watch event. The ' \
'connection might have been closed. Sleeping for ' \
"#{Thread.current[:pod_watch_retry_backoff_interval]} " \
'seconds and resetting the pod watcher.', e
)
sleep(Thread.current[:pod_watch_retry_backoff_interval])
Thread.current[:pod_watch_retry_count] += 1
Thread.current[:pod_watch_retry_backoff_interval] *= @watch_retry_exponential_backoff_base
if e.message == "Unauthorized"
@client = nil
# recreate client to refresh token
log.info("Encountered 'Unauthorized' exception in watch, recreating client to refresh token")
create_client()
pod_watcher = nil
else
# Since retries failed for many times, log as errors instead
# of info and raise exceptions and trigger Fluentd to restart.
message =
'Exception encountered parsing pod watch event. The ' \
'connection might have been closed. Retried ' \
"#{@watch_retry_max_times} times yet still failing. Restarting."
log.error(message, e)
raise Fluent::UnrecoverableError, message
@stats.bump(:pod_watch_failures)
if Thread.current[:pod_watch_retry_count] < @watch_retry_max_times
# Instead of raising exceptions and crashing Fluentd, swallow
# the exception and reset the watcher.
log.info(
'Exception encountered parsing pod watch event. The ' \
'connection might have been closed. Sleeping for ' \
"#{Thread.current[:pod_watch_retry_backoff_interval]} " \
'seconds and resetting the pod watcher.', e
)
sleep(Thread.current[:pod_watch_retry_backoff_interval])
Thread.current[:pod_watch_retry_count] += 1
Thread.current[:pod_watch_retry_backoff_interval] *= @watch_retry_exponential_backoff_base
pod_watcher = nil
else
# Since retries failed for many times, log as errors instead
# of info and raise exceptions and trigger Fluentd to restart.
message =
'Exception encountered parsing pod watch event. The ' \
'connection might have been closed. Retried ' \
"#{@watch_retry_max_times} times yet still failing. Restarting."
log.error(message, e)
raise Fluent::UnrecoverableError, message
end
end
end
end