# Copyright 2016 Google Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Time series as :mod:`pandas` dataframes."""

import itertools

TOP_RESOURCE_LABELS = (
    'project_id',
    'aws_account',
    'location',
    'region',
    'zone',
)


def _build_dataframe(time_series_iterable,
                     label=None, labels=None):  # pragma: NO COVER
    """Build a :mod:`pandas` dataframe out of time series.

    :type time_series_iterable:
        iterable over :class:`~gcloud.monitoring.timeseries.TimeSeries`
    :param time_series_iterable:
        An iterable (e.g., a query object) yielding time series.

    :type label: string or None
    :param label:
        The label name to use for the dataframe header. This can be the name
        of a resource label or metric label (e.g., ``"instance_name"``), or
        the string ``"resource_type"``.

    :type labels: list of strings, or None
    :param labels:
        A list or tuple of label names to use for the dataframe header.
        If more than one label name is provided, the resulting dataframe
        will have a multi-level column header.

        Specifying neither ``label`` nor ``labels`` results in a dataframe
        with a multi-level column header including the resource type and
        all available resource and metric labels.

        Specifying both ``label`` and ``labels`` is an error.

    :rtype: :class:`pandas.DataFrame`
    :returns: A dataframe where each column represents one time series.

    :raises: :exc:`ValueError` if both ``label`` and ``labels`` are given,
             or if ``labels`` is given but empty.
    """
    # Imported lazily so that the module can be imported without pandas.
    import pandas   # pylint: disable=import-error

    if labels is not None:
        if label is not None:
            raise ValueError('Cannot specify both "label" and "labels".')
        elif not labels:
            raise ValueError('"labels" must be non-empty or None.')

    columns = []
    headers = []
    for time_series in time_series_iterable:
        # Materialize the points once: they are traversed twice below, and
        # a one-shot iterable would otherwise yield values but no index.
        points = list(time_series.points)
        pandas_series = pandas.Series(
            data=[point.value for point in points],
            index=[point.end_time for point in points],
        )
        columns.append(pandas_series)
        headers.append(time_series.header())

    # Implement a smart default of using all available labels.
    if label is None and labels is None:
        resource_labels = set(itertools.chain.from_iterable(
            header.resource.labels for header in headers))
        metric_labels = set(itertools.chain.from_iterable(
            header.metric.labels for header in headers))
        labels = (['resource_type'] +
                  _sorted_resource_labels(resource_labels) +
                  sorted(metric_labels))

    # Assemble the columns into a DataFrame. Each series becomes one record
    # (row), so transpose to get one column per time series.
    dataframe = pandas.DataFrame.from_records(columns).T

    # Convert the timestamp strings into a DatetimeIndex.
    dataframe.index = pandas.to_datetime(dataframe.index)

    # Build a multi-level stack of column headers. Some labels may
    # be undefined for some time series.
    levels = []
    for key in labels or [label]:
        level = [header.labels.get(key, '') for header in headers]
        levels.append(level)

    # Build the column MultiIndex. Do not include level names
    # in the column header if the user requested a single-level header
    # by specifying "label".
    dataframe.columns = pandas.MultiIndex.from_arrays(
        levels,
        names=labels or None)

    # Sort the rows just in case (since the API doesn't guarantee the
    # ordering), and sort the columns lexicographically.
    return dataframe.sort_index(axis=0).sort_index(axis=1)


def _sorted_resource_labels(labels):
    """Sort label names, putting well-known resource labels first.

    :type labels: collection of strings
    :param labels: The label names to order.

    :rtype: list of strings
    :returns: The names found in ``TOP_RESOURCE_LABELS`` (in that tuple's
              order), followed by all remaining names in sorted order.
    """
    head = [label for label in TOP_RESOURCE_LABELS if label in labels]
    tail = sorted(label for label in labels
                  if label not in TOP_RESOURCE_LABELS)
    return head + tail