Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow multiple 'runs' for Periscope #196

Merged
merged 12 commits into from
Aug 31, 2022
13 changes: 12 additions & 1 deletion .vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,17 @@
"env": {},
"args": ["-test.v"],
"showLog": true
}
},
{
"name": "Launch Tests with race check",
"type": "go",
"request": "launch",
"program": "${fileDirname}",
"mode": "test",
"env": {},
"buildFlags": "-race",
"args": ["-test.v"],
"showLog": true
},
]
}
32 changes: 23 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -97,15 +97,17 @@ secretGenerator:
- AZURE_BLOB_CONTAINER_NAME=<CONTAINER_NAME>
- AZURE_BLOB_SAS_KEY=<SAS_KEY>

# Uncomment for optional configuration (default values shown)
# configMapGenerator:
# - name: diagnostic-config
# behavior: replace
# literals:
# - DIAGNOSTIC_CONTAINERLOGS_LIST=kube-system # space-separated namespaces
# - DIAGNOSTIC_KUBEOBJECTS_LIST=kube-system/pod kube-system/service kube-system/deployment # space-separated list of namespace/resource-type[/resource]
# - DIAGNOSTIC_NODELOGS_LIST_LINUX="/var/log/azure/cluster-provision.log /var/log/cloud-init.log" # space-separated log file locations
# - DIAGNOSTIC_NODELOGS_LIST_WINDOWS="C:\AzureData\CustomDataSetupScript.log" # space-separated log file locations
# Commented-out config values are the defaults. Uncomment to change.
configMapGenerator:
- name: diagnostic-config
behavior: merge
literals:
- DIAGNOSTIC_RUN_ID=<RUN_ID>
# - DIAGNOSTIC_CONTAINERLOGS_LIST=kube-system # space-separated namespaces
# - DIAGNOSTIC_KUBEOBJECTS_LIST=kube-system/pod kube-system/service kube-system/deployment # space-separated list of namespace/resource-type[/resource]
# - DIAGNOSTIC_NODELOGS_LIST_LINUX="/var/log/azure/cluster-provision.log /var/log/cloud-init.log" # space-separated log file locations
# - DIAGNOSTIC_NODELOGS_LIST_WINDOWS="C:\AzureData\CustomDataSetupScript.log" # space-separated log file locations
# - COLLECTOR_LIST="" # space-separated list containing any of 'connectedCluster' (enables helm/pods-containerlogs, disables iptables/kubelet/nodelogs/pdb/systemlogs/systemperf), 'OSM' (enables osm/smi), 'SMI' (enables smi).
```

All placeholders in angle brackets (`<`/`>`) need to be replaced with the relevant values:
Expand All @@ -117,12 +119,24 @@ All placeholders in angled brackets (`<`/`>`) need to be substituted for the rel
- `ss`: `b` (Service: blob)
- `srt`: `sco` (Resource types: service, container and object)
- `sp`: `rlacwd` (Permissions: read, list, add, create, write, delete)
- `RUN_ID`: The identifier for a particular 'run' of Periscope, by convention a timestamp formatted as `YYYY-MM-DDThh-mm-ssZ`. This will become the topmost container within `CONTAINER_NAME`.

You can then deploy Periscope by running:
```sh
kubectl apply -k <path-to-kustomize-directory>
```

To re-run without deleting and recreating resources, you can update the `RUN_ID` value in the ConfigMap. Depending on the expiry of the SAS token, you may need to update the `AZURE_BLOB_SAS_KEY` value in the Secret first:
```sh
# Update SAS token (if expired)
sas=...
kubectl patch secret -n aks-periscope azureblob-secret -p="{\"data\":{\"AZURE_BLOB_SAS_KEY\": \"$(echo -n ?$sas | base64 -w 0)\"}}"

# Update DIAGNOSTIC_RUN_ID
runId=$(date -u '+%Y-%m-%dT%H-%M-%SZ')
kubectl patch configmap -n aks-periscope diagnostic-config -p="{\"data\":{\"DIAGNOSTIC_RUN_ID\": \"$runId\"}}"
```

### Using Azure Command-Line tool

AKS Periscope can be deployed using the Azure Command-Line Interface (CLI). The steps are:
Expand Down
72 changes: 50 additions & 22 deletions cmd/aks-periscope/aks-periscope.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@ package main

import (
"bytes"
"fmt"
"log"
"runtime"
"sync"
"time"

Expand All @@ -15,56 +17,87 @@ import (
)

// main is the entry point for AKS Periscope. It starts a file content watcher
// on the run ID config file and a goroutine that performs one Periscope run for
// each value received on the run ID channel. It then blocks until an error is
// received on errChan and terminates the process via log.Fatalf.
func main() {
config, err := restclient.InClusterConfig()
osIdentifier, err := utils.StringToOSIdentifier(runtime.GOOS)
if err != nil {
log.Fatalf("Cannot load kubeconfig: %v", err)
log.Fatalf("cannot determine OS: %v", err)
}

creationTimeStamp, err := utils.GetCreationTimeStamp(config)
knownFilePaths, err := utils.GetKnownFilePaths(osIdentifier)
if err != nil {
log.Fatalf("Failed to get creation timestamp: %v", err)
log.Fatalf("failed to get file paths: %v", err)
}

runtimeInfo, err := utils.GetRuntimeInfo()
fileSystem := utils.NewFileSystem()

// Create a watcher for the Run ID file that checks its content every 10 seconds
fileWatcher := utils.NewFileContentWatcher(fileSystem, 10*time.Second)

// Create a channel for unrecoverable errors
// The channel is unbuffered, so a sender blocks until main receives from it below.
errChan := make(chan error)

// Add a watcher for the run ID file content
// NOTE(review): presumably the watcher sends the file's content on runIdChan
// whenever it changes and reports its own failures on errChan - confirm
// against the FileContentWatcher implementation.
runIdChan := make(chan string)
fileWatcher.AddHandler(knownFilePaths.GetConfigPath(utils.RunIdKey), runIdChan, errChan)

// Runs are handled sequentially by this single goroutine: the next run ID is
// not read from runIdChan until the current call to run has returned.
go func() {
for {
runId := <-runIdChan
log.Printf("Starting Periscope run %s", runId)
err := run(osIdentifier, knownFilePaths, fileSystem)
if err != nil {
// Any error returned by run is treated as unrecoverable and handed to main.
errChan <- err
}

// This line is reached whether or not run returned an error.
log.Printf("Completed Periscope run %s", runId)
}
}()

fileWatcher.Start()

// Run until unrecoverable error
err = <-errChan
log.Fatalf("Error running Periscope: %v", err)
}

func run(osIdentifier utils.OSIdentifier, knownFilePaths *utils.KnownFilePaths, fileSystem interfaces.FileSystemAccessor) error {
runtimeInfo, err := utils.GetRuntimeInfo(fileSystem, knownFilePaths)
if err != nil {
log.Fatalf("Failed to get runtime information: %v", err)
}

knownFilePaths, err := utils.GetKnownFilePaths(runtimeInfo)
config, err := restclient.InClusterConfig()
if err != nil {
log.Fatalf("Failed to get file paths: %v", err)
return fmt.Errorf("cannot load kubeconfig: %w", err)
}

exp := exporter.NewAzureBlobExporter(runtimeInfo, knownFilePaths, creationTimeStamp)
exp := exporter.NewAzureBlobExporter(runtimeInfo, knownFilePaths, runtimeInfo.RunId)

// Copies self-signed cert information to container if application is running on Azure Stack Cloud.
// We need the cert in order to communicate with the storage account.
if utils.IsAzureStackCloud(knownFilePaths) {
if err := utils.CopyFile(knownFilePaths.AzureStackCertHost, knownFilePaths.AzureStackCertContainer); err != nil {
log.Fatalf("Cannot copy cert for Azure Stack Cloud environment: %v", err)
return fmt.Errorf("cannot copy cert for Azure Stack Cloud environment: %w", err)
}
}

fileSystem := utils.NewFileSystem()

dnsCollector := collector.NewDNSCollector(runtimeInfo, knownFilePaths, fileSystem)
kubeletCmdCollector := collector.NewKubeletCmdCollector(runtimeInfo)
dnsCollector := collector.NewDNSCollector(osIdentifier, knownFilePaths, fileSystem)
kubeletCmdCollector := collector.NewKubeletCmdCollector(osIdentifier, runtimeInfo)
networkOutboundCollector := collector.NewNetworkOutboundCollector()
collectors := []interfaces.Collector{
dnsCollector,
kubeletCmdCollector,
networkOutboundCollector,
collector.NewHelmCollector(config, runtimeInfo),
collector.NewIPTablesCollector(runtimeInfo),
collector.NewIPTablesCollector(osIdentifier, runtimeInfo),
collector.NewKubeObjectsCollector(config, runtimeInfo),
collector.NewNodeLogsCollector(runtimeInfo, fileSystem),
collector.NewOsmCollector(config, runtimeInfo),
collector.NewPDBCollector(config, runtimeInfo),
collector.NewPodsContainerLogsCollector(config, runtimeInfo),
collector.NewSmiCollector(config, runtimeInfo),
collector.NewSystemLogsCollector(runtimeInfo),
collector.NewSystemLogsCollector(osIdentifier, runtimeInfo),
collector.NewSystemPerfCollector(config, runtimeInfo),
collector.NewWindowsLogsCollector(runtimeInfo, knownFilePaths, fileSystem, 10*time.Second, 20*time.Minute),
collector.NewWindowsLogsCollector(osIdentifier, runtimeInfo, knownFilePaths, fileSystem, 10*time.Second, 20*time.Minute),
}

collectorGrp := new(sync.WaitGroup)
Expand Down Expand Up @@ -136,10 +169,5 @@ func main() {
}
}

// TODO: Hack: for now AKS-Periscope is running as a daemonset so it shall not stop (or the pod will be restarted)
// Revert from https://github.com/Azure/aks-periscope/blob/b98d66a238e942158ef2628a9315b58937ff9c8f/cmd/aks-periscope/aks-periscope.go#L70
select {}

// TODO: remove this //nolint comment once the select{} has been removed
//nolint:govet
return nil
}
30 changes: 20 additions & 10 deletions deployment/base/daemon-set.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,11 @@ spec:
valueFrom:
fieldRef:
fieldPath: spec.nodeName
envFrom:
- configMapRef:
name: diagnostic-config
- secretRef:
name: azureblob-secret
volumeMounts:
- name: diag-config-volume
mountPath: /config
- name: storage-secret-volume
mountPath: /secret
- name: varlog
mountPath: /var/log
- name: resolvlog
Expand All @@ -48,6 +47,12 @@ spec:
memory: "2000Mi"
cpu: "1000m"
volumes:
- name: diag-config-volume
configMap:
name: diagnostic-config
- name: storage-secret-volume
secret:
secretName: azureblob-secret
- name: varlog
hostPath:
path: /var/log
Expand Down Expand Up @@ -86,12 +91,11 @@ spec:
valueFrom:
fieldRef:
fieldPath: spec.nodeName
envFrom:
- configMapRef:
name: diagnostic-config
- secretRef:
name: azureblob-secret
volumeMounts:
- name: diag-config-volume
mountPath: /config
- name: storage-secret-volume
mountPath: /secret
- name: k
mountPath: /k
- name: azuredata
Expand All @@ -104,6 +108,12 @@ spec:
memory: "2000Mi"
cpu: "1000m"
volumes:
- name: diag-config-volume
configMap:
name: diagnostic-config
- name: storage-secret-volume
secret:
secretName: azureblob-secret
- name: k
hostPath:
path: /k
Expand Down
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ require (
github.com/containerd/containerd v1.4.13 // indirect
github.com/docker/docker v20.10.14+incompatible
github.com/google/uuid v1.2.0
github.com/hashicorp/go-multierror v1.1.1 // indirect
github.com/onsi/gomega v1.13.0 // indirect
helm.sh/helm/v3 v3.6.3
k8s.io/api v0.21.3
Expand Down
3 changes: 3 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -420,11 +420,14 @@ github.com/hashicorp/consul/api v1.1.0/go.mod h1:VmuI/Lkw1nC05EYQWNKwWGbkg+FbDBt
github.com/hashicorp/consul/api v1.3.0/go.mod h1:MmDNSzIMUjNpY/mQ398R4bk2FnqQLoPndWW5VkKPlCE=
github.com/hashicorp/consul/sdk v0.1.1/go.mod h1:VKf9jXwCTEY1QZP2MOLRhb5i/I/ssyNV1vwHyQBF0x8=
github.com/hashicorp/consul/sdk v0.3.0/go.mod h1:VKf9jXwCTEY1QZP2MOLRhb5i/I/ssyNV1vwHyQBF0x8=
github.com/hashicorp/errwrap v1.0.0 h1:hLrqtEDnRye3+sgx6z4qVLNuviH3MR5aQ0ykNJa/UYA=
github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4=
github.com/hashicorp/go-cleanhttp v0.5.1/go.mod h1:JpRdi6/HCYpAwUzNwuwqhbovhLtngrth3wmdIIUrZ80=
github.com/hashicorp/go-immutable-radix v1.0.0/go.mod h1:0y9vanUI8NX6FsYoO3zeMjhV/C5i9g4Q3DwcSNZ4P60=
github.com/hashicorp/go-msgpack v0.5.3/go.mod h1:ahLV/dePpqEmjfWmKiqvPkv/twdG7iPBM1vqhUKIvfM=
github.com/hashicorp/go-multierror v1.0.0/go.mod h1:dHtQlpGsu+cZNNAkkCN/P3hoUDHhCYQXV3UM06sGGrk=
github.com/hashicorp/go-multierror v1.1.1 h1:H5DkEtf6CXdFp0N0Em5UCwQpXMWke8IA0+lD48awMYo=
github.com/hashicorp/go-multierror v1.1.1/go.mod h1:iw975J/qwKPdAO1clOe2L8331t/9/fmwbPZ6JB6eMoM=
github.com/hashicorp/go-rootcerts v1.0.0/go.mod h1:K6zTfqpRlCUIjkwsN4Z+hiSfzSTQa6eBIzfwKfwNnHU=
github.com/hashicorp/go-sockaddr v1.0.0/go.mod h1:7Xibr9yA9JjQq1JpNB2Vw7kxv8xerXegt+ozgdvDeDU=
github.com/hashicorp/go-syslog v1.0.0/go.mod h1:qPfqrKkXGihmCqbJM2mZgkZGvKG1dFdvsLplgctolz4=
Expand Down
23 changes: 8 additions & 15 deletions pkg/collector/dns_collector.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ package collector

import (
"fmt"
"io/ioutil"
"io"

"github.com/Azure/aks-periscope/pkg/interfaces"
"github.com/Azure/aks-periscope/pkg/utils"
Expand All @@ -12,17 +12,17 @@ import (
type DNSCollector struct {
// HostConf and ContainerConf are both initialized to "" by NewDNSCollector.
// NOTE(review): presumably filled in by Collect with resolv.conf content - confirm.
HostConf string
ContainerConf string
runtimeInfo *utils.RuntimeInfo
// osIdentifier is checked by CheckSupported, which rejects anything other than utils.Linux.
osIdentifier utils.OSIdentifier
filePaths *utils.KnownFilePaths
// fileSystem is used by getConfFileContent to open files for reading.
fileSystem interfaces.FileSystemAccessor
}

// NewDNSCollector constructs a DNSCollector with empty HostConf and ContainerConf values.
func NewDNSCollector(runtimeInfo *utils.RuntimeInfo, filePaths *utils.KnownFilePaths, fileSystem interfaces.FileSystemAccessor) *DNSCollector {
func NewDNSCollector(osIdentifier utils.OSIdentifier, filePaths *utils.KnownFilePaths, fileSystem interfaces.FileSystemAccessor) *DNSCollector {
return &DNSCollector{
HostConf: "",
ContainerConf: "",
runtimeInfo: runtimeInfo,
osIdentifier: osIdentifier,
filePaths: filePaths,
fileSystem: fileSystem,
}
Expand All @@ -36,8 +36,8 @@ func (collector *DNSCollector) CheckSupported() error {
// NOTE: This *might* be achievable in Windows using APIs that query the registry, see:
// https://kubernetes.io/docs/setup/production-environment/windows/intro-windows-in-kubernetes/#networking
// But for now it's restricted to Linux containers only, in which we can read `resolv.conf`.
if collector.runtimeInfo.OSIdentifier != "linux" {
return fmt.Errorf("unsupported OS: %s", collector.runtimeInfo.OSIdentifier)
if collector.osIdentifier != utils.Linux {
return fmt.Errorf("unsupported OS: %s", collector.osIdentifier)
}

return nil
Expand All @@ -52,19 +52,12 @@ func (collector *DNSCollector) Collect() error {
}

// getConfFileContent returns the content of the file at filePath as a string.
// If the file cannot be opened or read, the error's message is returned in
// place of the content; the error itself is not propagated to the caller.
func (collector *DNSCollector) getConfFileContent(filePath string) string {
reader, err := collector.fileSystem.GetFileReader(filePath)
content, err := utils.GetContent(func() (io.ReadCloser, error) { return collector.fileSystem.GetFileReader(filePath) })
if err != nil {
return err.Error()
}

defer reader.Close()

content, err := ioutil.ReadAll(reader)
if err != nil {
return err.Error()
}

return string(content)
return content
}

func (collector *DNSCollector) GetData() map[string]interfaces.DataValue {
Expand Down