Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Operator refactor - validation and import cleanup #1356

Merged
merged 5 commits into from
Aug 13, 2020
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 7 additions & 30 deletions pkg/operator/operator.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ import (
"math/rand"
"net/http"
"net/url"
"os"
"sync"
"time"

Expand Down Expand Up @@ -104,24 +103,10 @@ type defaultReportingOperator struct {
importers map[string]*prestostore.PrometheusImporter
}

// Valid reports whether the TLS settings are usable. When UseTLS is false no
// further checks are made. When it is true, TLSCert and TLSKey must both be
// non-empty; the certificate is checked first, so only the first missing
// value is reported.
func (cfg *TLSConfig) Valid() error {
	switch {
	case !cfg.UseTLS:
		// TLS disabled: cert and key are not inspected.
		return nil
	case cfg.TLSCert == "":
		return fmt.Errorf("Must set TLS certificate if TLS is enabled")
	case cfg.TLSKey == "":
		return fmt.Errorf("Must set TLS private key if TLS is enabled")
	default:
		return nil
	}
}

func New(logger log.FieldLogger, cfg Config) (ReportingOperator, error) {
timflannagan marked this conversation as resolved.
Show resolved Hide resolved
if err := cfg.APITLSConfig.Valid(); err != nil {
return nil, err
}
if err := cfg.MetricsTLSConfig.Valid(); err != nil {
return nil, err
errs := IsValidConfig(&cfg)
if errs != nil {
return nil, errs
}
Comment on lines +107 to 110
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Building off my comment from above, I wonder if we should be doing all of the reporting configuration validation in the cmd/reporting-operator driver function that provides the entry point for this operator, rather than in the constructor.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It can be called from the cmd package, but it should not be done there. Commands in Kube/OpenShift follow a pattern of Complete, Validate, Run. We should refactor ours to match and call IsValidConfig in the command's Validate method. However, we should keep the call in New as well, to protect against programmatic access to the operator no matter how it's started in the future.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.


logger.Debugf("config: %s", spew.Sprintf("%+v", cfg))
Expand Down Expand Up @@ -169,15 +154,11 @@ func New(logger log.FieldLogger, cfg Config) (ReportingOperator, error) {
return nil, fmt.Errorf("Unable to create Metering client: %v", err)
}

var informerNamespace string
informerNamespace := cfg.OwnNamespace
if cfg.AllNamespaces {
informerNamespace = metav1.NamespaceAll
} else if len(cfg.TargetNamespaces) == 1 {
informerNamespace = cfg.TargetNamespaces[0]
} else if len(cfg.TargetNamespaces) > 1 && !cfg.AllNamespaces {
return nil, fmt.Errorf("must set --all-namespaces if more than one namespace is passed to --target-namespaces")
} else {
informerNamespace = cfg.OwnNamespace
}

clock := clock.RealClock{}
Expand Down Expand Up @@ -658,13 +639,9 @@ func (op *defaultReportingOperator) Run(ctx context.Context) error {
func (op *defaultReportingOperator) newPrometheusConnFromURL(url string) (prom.API, error) {
transportConfig := &transport.Config{}
if op.cfg.PrometheusConfig.CAFile != "" {
if _, err := os.Stat(op.cfg.PrometheusConfig.CAFile); err == nil {
// Use the configured CA for communicating to Prometheus
transportConfig.TLS.CAFile = op.cfg.PrometheusConfig.CAFile
op.logger.Infof("using %s as CA for Prometheus", op.cfg.PrometheusConfig.CAFile)
} else {
return nil, err
}
// Use the configured CA for communicating to Prometheus
transportConfig.TLS.CAFile = op.cfg.PrometheusConfig.CAFile
op.logger.Infof("using %s as CA for Prometheus", op.cfg.PrometheusConfig.CAFile)
} else {
op.logger.Infof("using system CAs for Prometheus")
transportConfig.TLS.CAData = nil
Expand Down
178 changes: 178 additions & 0 deletions pkg/operator/validation.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
package operator

import (
"fmt"
"net"
"os"

"k8s.io/apimachinery/pkg/util/errors"
)

// IsValidConfig runs every configuration validator against cfg and returns
// all failures combined into a single aggregate error, or nil when every
// check passes.
//
// The namespace, listen, Presto, Hive, kubeconfig and Prometheus validators
// run first, in that order, followed by the API and metrics TLS checks. A
// failing validator does not stop the others, so the caller sees every
// problem at once.
func IsValidConfig(cfg *Config) error {
	var errs []error

	// Each validator returns nil on success. Keep only real failures so the
	// emptiness check below is meaningful and the result does not depend on
	// the aggregate constructor filtering out nil entries.
	for _, err := range []error{
		IsValidNamespaceConfig(cfg),
		IsValidListenConfig(cfg),
		IsValidPrestoConfig(cfg),
		IsValidHiveConfig(cfg),
		IsValidKubeConfig(cfg.Kubeconfig),
		IsValidPrometheusConfig(cfg),
	} {
		if err != nil {
			errs = append(errs, err)
		}
	}

	// The TLS errors are prefixed so the two otherwise identical messages can
	// be told apart.
	if err := isValidTLSConfig(&cfg.APITLSConfig); err != nil {
		errs = append(errs, fmt.Errorf("error validating apiTLSConfig: %s", err.Error()))
	}

	if err := isValidTLSConfig(&cfg.MetricsTLSConfig); err != nil {
		errs = append(errs, fmt.Errorf("error validating metricsTLSConfig: %s", err.Error()))
	}

	if len(errs) == 0 {
		return nil
	}
	return errors.NewAggregate(errs)
}

// IsValidNamespaceConfig rejects a configuration that lists more than one
// target namespace without also enabling AllNamespaces. Zero or one target
// namespace is always accepted.
func IsValidNamespaceConfig(cfg *Config) error {
	multipleTargets := len(cfg.TargetNamespaces) > 1
	if !multipleTargets || cfg.AllNamespaces {
		return nil
	}
	return fmt.Errorf("must set allNamespaces if more than one namespace is passed to targetNamespaces")
}

// IsValidListenConfig validates the APIListen, MetricsListen and PprofListen
// addresses of cfg. Each address is checked with isValidHostPort, which
// accepts an empty value and otherwise requires the string to split into a
// host and a port. All failures are returned together as one aggregate
// error; nil is returned when every address is acceptable.
func IsValidListenConfig(cfg *Config) error {
	listeners := []struct {
		addr string
		name string
	}{
		{cfg.APIListen, "apiListen"},
		{cfg.MetricsListen, "metricsListen"},
		{cfg.PprofListen, "pprofListen"},
	}

	// Collect only real failures: appending nil results would leave the slice
	// permanently non-empty and turn the emptiness check into dead code.
	var errs []error
	for _, l := range listeners {
		if err := isValidHostPort(l.addr, l.name); err != nil {
			errs = append(errs, err)
		}
	}

	if len(errs) == 0 {
		return nil
	}
	return errors.NewAggregate(errs)
}

// IsValidPrestoConfig validates the Presto* fields of cfg and returns every
// problem found as a single aggregate error.
//
// Checks performed:
//   - PrestoHost, when non-empty, must split into host and port (see
//     isValidHostPort).
//   - With PrestoUseTLS false, PrestoUseClientCertAuth must be false and
//     PrestoCAFile must be empty.
//   - PrestoCAFile, when set, must be stat-able. This check runs regardless of
//     PrestoUseTLS, so a missing CA file with TLS disabled yields two errors.
//   - PrestoClientCertFile and PrestoClientKeyFile must be set together.
//
// NOTE(review): the isValidHostPort result is appended even when it is nil, so
// errs is never empty and the final len(errs) > 0 check is always true. A nil
// return therefore presumably relies on errors.NewAggregate discarding nil
// entries — confirm against the vendored apimachinery version.
func IsValidPrestoConfig(cfg *Config) error {
errs := []error{}

errs = append(errs, isValidHostPort(cfg.PrestoHost, "prestoHost"))

// TLS-only options are rejected outright when TLS is disabled.
if !cfg.PrestoUseTLS {
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

previously we allowed config to be set even if this was set to false. Should we continue?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would imagine that makes sense, i.e. rejecting a configuration where you explicitly set TLS-related flags yet disable TLS entirely for the reporting-operator -> presto communications.

if cfg.PrestoUseClientCertAuth {
errs = append(errs, fmt.Errorf("prestoUseClientCertAuth cannot be set to true if prestoUseTLS is false"))
}

if len(cfg.PrestoCAFile) > 0 {
errs = append(errs, fmt.Errorf("prestoCAFile cannot be set if prestoUseTLS is false"))
}
}

// A configured CA file must at least be reachable on disk; the os.Stat error
// is reported unwrapped.
if len(cfg.PrestoCAFile) > 0 {
if _, err := os.Stat(cfg.PrestoCAFile); err != nil {
errs = append(errs, err)
}
}

// Exactly one of cert/key being set is an error; both or neither is fine.
if (len(cfg.PrestoClientCertFile) > 0 && len(cfg.PrestoClientKeyFile) == 0) ||
(len(cfg.PrestoClientKeyFile) > 0 && len(cfg.PrestoClientCertFile) == 0) {
errs = append(errs, fmt.Errorf("prestoClientCertFile and prestoClientKeyFile must both be specified or neither specified"))
}

if len(errs) > 0 {
return errors.NewAggregate(errs)
}
return nil
}

// IsValidHiveConfig validates the Hive* fields of cfg and returns every
// problem found as a single aggregate error.
//
// Checks performed:
//   - HiveHost, when non-empty, must split into host and port (see
//     isValidHostPort).
//   - With HiveUseTLS false, HiveUseClientCertAuth must be false and
//     HiveCAFile must be empty.
//   - HiveCAFile, when set, must be stat-able. This check runs regardless of
//     HiveUseTLS, so a missing CA file with TLS disabled yields two errors.
//   - HiveClientCertFile and HiveClientKeyFile must be set together.
//
// NOTE(review): the isValidHostPort result is appended even when it is nil, so
// errs is never empty and the final len(errs) > 0 check is always true. A nil
// return therefore presumably relies on errors.NewAggregate discarding nil
// entries — confirm against the vendored apimachinery version.
func IsValidHiveConfig(cfg *Config) error {
errs := []error{}

errs = append(errs, isValidHostPort(cfg.HiveHost, "hiveHost"))

// TLS-only options are rejected outright when TLS is disabled.
if !cfg.HiveUseTLS {
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

previously we allowed config to be set even if this was set to false. Should we continue?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same as my comment below. I think this is fine UX, reject any configurations like this if you're disabling TLS on the reporting-operator side for that component.

if cfg.HiveUseClientCertAuth {
errs = append(errs, fmt.Errorf("hiveUseClientCertAuth cannot be set to true if hiveUseTLS is false"))
}

if len(cfg.HiveCAFile) > 0 {
errs = append(errs, fmt.Errorf("hiveCAFile cannot be set if hiveUseTLS is false"))
}
}

// A configured CA file must at least be reachable on disk; the os.Stat error
// is reported unwrapped.
if len(cfg.HiveCAFile) > 0 {
if _, err := os.Stat(cfg.HiveCAFile); err != nil {
errs = append(errs, err)
}
}

// Exactly one of cert/key being set is an error; both or neither is fine.
if (len(cfg.HiveClientCertFile) > 0 && len(cfg.HiveClientKeyFile) == 0) ||
(len(cfg.HiveClientKeyFile) > 0 && len(cfg.HiveClientCertFile) == 0) {
errs = append(errs, fmt.Errorf("hiveClientCertFile and hiveClientKeyFile must both be specified or neither specified"))
}

if len(errs) > 0 {
return errors.NewAggregate(errs)
}
return nil
}

// IsValidKubeConfig checks that kubeconfig, when provided, names a path that
// can be stat'ed. An empty string is accepted and means no kubeconfig was
// supplied. Only the existence of the path is checked; its contents are not
// read. The os.Stat error is returned unchanged on failure.
func IsValidKubeConfig(kubeconfig string) error {
	if kubeconfig == "" {
		return nil
	}
	_, statErr := os.Stat(kubeconfig)
	return statErr
}

// IsValidPrometheusConfig validates the Prometheus-related fields of cfg and
// returns every problem found as a single aggregate error, or nil when there
// are none.
//
// Checks performed:
//   - PrometheusConfig.CAFile, when non-empty, must be stat-able; the os.Stat
//     error is reported unwrapped.
//   - PrometheusDataSourceGlobalImportFromTime (non-nil) and
//     PrometheusDataSourceMaxBackfillImportDuration (> 0) must not both be set.
func IsValidPrometheusConfig(cfg *Config) error {
errs := []error{}
if cfg.PrometheusConfig.CAFile != "" {
if _, err := os.Stat(cfg.PrometheusConfig.CAFile); err != nil {
errs = append(errs, err)
}
}

// PrometheusDataSourceMaxBackfillImportDuration overrides
// PrometheusDataSourceGlobalImportFromTime, so setting both is rejected
// rather than silently ignoring one of them.
if cfg.PrometheusDataSourceGlobalImportFromTime != nil && cfg.PrometheusDataSourceMaxBackfillImportDuration > 0 {
errs = append(errs, fmt.Errorf("prometheusDataSourceGlobalImportFromTime and prometheusDataSourceMaxBackfillImportDuration cannot both be set"))
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

previously we stated that one overrides the other. Introducing this will cause explicit failure. Acceptable?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure where I land on this validation check. I think it sounds reasonable looking at the codebase and making sure we reject configurations that won't be respected. @bentito any opinion on this validation check.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@bentito - thoughts on this one?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Reading the usage in start.go, it seems like this validates the stated usage. If it's Prom import question, they're both just ways of getting at how far back to try to grab Prom data when metering somehow gets out of sync. I think PrometheusDataSourceGlobalImportFromTime has more potential to be pretty awful for cluster workload, but I don't think that's being asked here.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

just validation of input. Sounds like I can leave it. Thanks

}

if len(errs) > 0 {
return errors.NewAggregate(errs)
}
return nil
}

// isValidTLSConfig reports whether cfg is a usable TLS configuration. When
// UseTLS is false no further checks are made. When it is true, TLSCert and
// TLSKey must both be non-empty; the certificate is checked first, so only
// the first missing value is reported.
func isValidTLSConfig(cfg *TLSConfig) error {
	switch {
	case !cfg.UseTLS:
		// TLS disabled: cert and key are not inspected.
		return nil
	case cfg.TLSCert == "":
		return fmt.Errorf("must set TLS certificate if TLS is enabled")
	case cfg.TLSKey == "":
		return fmt.Errorf("must set TLS private key if TLS is enabled")
	default:
		return nil
	}
}

// isValidHostPort checks that hp, when non-empty, can be split into a host and
// a port by net.SplitHostPort. An empty hp is accepted and returns nil. On
// failure the returned error is prefixed with "invalid <name>: ", where name
// identifies the configuration field being validated.
// TODO this is only validating non-empty strings. We may want to check for empty strings and report errors.
// TODO this requires a port to be specified, is that one of our requirements?
func isValidHostPort(hp string, name string) error {
if len(hp) > 0 {
if _, _, err := net.SplitHostPort(hp); err != nil {
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this will fail if a port is not given - can someone confirm that that is a correct behavior?

Copy link
Member

@bentito bentito Aug 10, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This gets called on api, metrics, pprof, presto, hive

I think it's valid to make the admin config the port for our API.
Metrics = Prom, has a default, might be ok to not spec.
pprof seems like it has a default port as well, so maybe okay to not spec.
Presto has a default port, might be ok to not spec.
Hive has a default thrift service port as well, so maybe that might not need being requeired to specify either.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, I will remove the calls/tests for apis that have default ports.

return fmt.Errorf("invalid %s: %s", name, err.Error())
}
}
return nil
}
Loading