假设我有一月份整整 1 个月(31 天)的以下时间序列数据:
#-----------------------------------------------------------
# LOAD THE DATASET
#-----------------------------------------------------------
import pandas as pd

# Read the raw CPU metrics and turn the 'timestamp' strings into real
# datetimes so that the .dt accessor can be used further down.
df = pd.read_csv('azure.csv')
df['timestamp'] = pd.to_datetime(df['timestamp'])
df.head()
# Preview of df:
#| | timestamp | max cpu | min cpu | avg cpu |
#|---:|:--------------------|------------:|------------:|------------:|
#| 0 | 2024-01-01 00:00:00 | 2.15705e+07 | 1.18587e+07 | 5.69916e+06 |
#| 1 | 2024-01-01 00:05:00 | 2.11123e+07 | 1.31363e+07 | 5.6416e+06 |
#| 2 | 2024-01-01 00:10:00 | 2.09826e+07 | 1.14945e+07 | 5.07762e+06 |
#| 3 | 2024-01-01 00:15:00 | 1.82659e+07 | 1.30294e+07 | 4.23199e+06 |
#| 4 | 2024-01-01 00:20:00 | 1.94654e+07 | 1.06001e+07 | 6.21284e+06 |

# Data preparation
# ==============================================================================
# Take an explicit copy: without it, sliced_df is a selection of df and the
# column assignments below raise SettingWithCopyWarning (and are not
# guaranteed to land in sliced_df).
sliced_df = df[['timestamp', 'avg cpu']].copy()
# Input data (the slice of the dataframe above):
#| | timestamp | avg cpu |
#|---:|:--------------------|------------:|
#| 0 | 2024-01-01 00:00:00 | 5.69916e+06 |
#| 1 | 2024-01-01 00:05:00 | 5.6416e+06 |
#| 2 | 2024-01-01 00:10:00 | 5.07762e+06 |
#| 3 | 2024-01-01 00:15:00 | 4.23199e+06 |
#| 4 | 2024-01-01 00:20:00 | 6.21284e+06 |

# Split each timestamp into hour, day of month and month.
sliced_df['hour'] = sliced_df['timestamp'].dt.hour
sliced_df['day'] = sliced_df['timestamp'].dt.day
sliced_df['month'] = sliced_df['timestamp'].dt.month
下面的示例受此来源启发,它使用按年份记录的时间数据,输入数据如下:
| day | max_temp_f | min_temp_f | precip_in | dayofyear | year |
|:--------------------|-------------:|-------------:|------------:|------------:|-------:|
| 1948-01-01 00:00:00 | 50 | 44 | 0 | 1 | 1948 |
| 1948-01-02 00:00:00 | 42 | 37 | 0 | 2 | 1948 |
| 1948-01-03 00:00:00 | 45 | 36 | 0 | 3 | 1948 |
| 1948-01-04 00:00:00 | 44 | 35 | 0 | 4 | 1948 |
| 1948-01-05 00:00:00 | 45 | 32 | 0 | 5 | 1948 |
他们这样进行数据准备、聚合和过滤:
# Reference climate: per-day-of-year mean over 1990-2020, plus the 2023 rows.
in_reference_years = df["year"].between(1990, 2020)
df_avg = df.loc[in_reference_years].groupby("dayofyear").mean()
df_2023 = df[df.year == 2023]

# Pair every 2023 day with the long-term mean of the same day of year.
paired = df_2023[["dayofyear", "max_temp_f"]].merge(
    df_avg.reset_index()[["dayofyear", "max_temp_f"]],
    on="dayofyear",
    suffixes=("_2023", "_avg"),
)
is_above = paired["max_temp_f_2023"] >= paired["max_temp_f_avg"]
is_below = paired["max_temp_f_2023"] < paired["max_temp_f_avg"]

# 'max_temp_f' keeps the 2023 value only on days at/above the mean (NaN elsewhere).
df_above = paired.copy()
df_above["max_temp_f"] = df_above["max_temp_f_2023"].where(is_above)

# 'max_temp_f' keeps the 2023 value only on days below the mean (NaN elsewhere).
df_below = paired.copy()
df_below["max_temp_f"] = df_below["max_temp_f_2023"].where(is_below)

# Number of 2023 days on each side of the long-term mean.
days_above = int(is_above.sum())
days_below = int(is_below.sum())
| day | max_temp_f | min_temp_f | precip_in | dayofyear | year |
|:--------------------|-------------:|-------------:|------------:|------------:|-------:|
| 1948-01-01 00:00:00 | 50 | 44 | 0 | 1 | 1948 |
| 1948-01-02 00:00:00 | 42 | 37 | 0 | 2 | 1948 |
| 1948-01-03 00:00:00 | 45 | 36 | 0 | 3 | 1948 |
| 1948-01-04 00:00:00 | 44 | 35 | 0 | 4 | 1948 |
| 1948-01-05 00:00:00 | 45 | 32 | 0 | 5 | 1948 |
多年来的输出示例:
我想以更小的时间粒度(每小时/每天/每月)而不是像上面示例那样按年来检索、呈现和可视化我的数据。我需要从数据帧中检索 avg cpu 列的 CPU 使用率高于/低于(above/below)以下阈值的观测值:
输出1:x 轴:1 月 21 日记录的每小时时间戳
- y 轴:1 月 21 日的“平均 CPU”使用率
- 阈值线:1 月 21 日的平均 CPU 使用率
输出2:x 轴:1 月(1 日至 31 日)的每日时间戳
处理我的数据帧以实现此目的的优雅方法是什么,以及如何相应地为所关注的 output1 和 output2 找到阈值?
- y 轴:1 月份的“平均 CPU”使用率
- 阈值线:一月份的平均 CPU 使用率
#-----------------------------------------------------------
# Generate data
#-----------------------------------------------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# One sample every 5 minutes covering all 31 days of January 2024, i.e.
# 2024-01-01 00:00 through 2024-01-31 23:55. (Adding only 24*30 hours would
# stop at Jan 31 00:00 and leave the last day with a single sample.)
start_time = pd.to_datetime('2024-01-01 00:00:00')
end_time = start_time + pd.DateOffset(days=31) - pd.Timedelta(minutes=5)
time_range = pd.date_range(start=start_time, end=end_time, freq='5min')

# Random CPU readings, one per timestamp; each series is drawn uniformly
# from its own half-open range [a, b).
min_cpu_values = np.random.uniform(2.23199e+06, 3.21284e+06, len(time_range))
avg_cpu_values = np.random.uniform(4.23199e+06, 6.21284e+06, len(time_range))
max_cpu_values = np.random.uniform(7.23199e+06, 8.21284e+06, len(time_range))

df = pd.DataFrame({'timestamp': time_range,
                   'min_cpu': min_cpu_values,
                   'avg_cpu': avg_cpu_values,
                   'max_cpu': max_cpu_values,
                   })
df

#-----------------------------------------------------------
# plot data
#-----------------------------------------------------------
signal1 = df.min_cpu
signal2 = df.avg_cpu
signal3 = df.max_cpu

fig, axes = plt.subplots(1, 1, figsize=(6, 3))
# Draw on the Axes object directly and label each line so the legend can
# tell the three series apart.
axes.plot(df.timestamp, signal1, label='min_cpu')
axes.plot(df.timestamp, signal2, label='avg_cpu')
axes.plot(df.timestamp, signal3, label='max_cpu')
axes.grid(True)
# Rotate the date labels through tick_params: calling set_xticklabels() with
# the current labels would freeze their text, so they would no longer follow
# the ticks when the view changes.
axes.tick_params(axis='x', labelrotation=25)
axes.set_ylabel('CPU usage\n (Signal amplitude)')
axes.set_xlabel('timestamp')
axes.set_title('Generated data over time for a month')
axes.legend()
您所说的 “the daily average for one certain day (i.e., 21st Jan. 2024-01-21 00:00:00 till 2024-01-22 00:00:00)” 实际上需要的是每小时平均值;而您所说的 “The monthly average for Jan.” 实际上是 1 月份的每日平均值。我将为您提供某一天的每小时平均值,您可以据此推导出想要的其他视图。您需要自行决定如何计算平均值。
您还可以玩转颜色和美学。
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
# Load the CPU metrics. A relative path is used (the same 'azure.csv' the
# question reads) instead of a path into one user's Downloads folder, so the
# script runs on any machine that has the file in the working directory.
df = pd.read_csv('azure.csv')
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Calendar parts of each timestamp; daily_analysis() groups and filters on them.
df['hour'] = df['timestamp'].dt.hour
# NOTE: despite its name, this column holds the day of the MONTH (1-31), not
# the day of the year. The name is kept because daily_analysis() filters on it.
df['dayofyear'] = df['timestamp'].dt.day
df['month'] = df['timestamp'].dt.month
df['year'] = df['timestamp'].dt.year
def daily_analysis(df, year_input, month_input, day_input, *, plot=True):
    """Compare one day's hourly peak 'max cpu' with the overall hourly mean.

    For every hour of the requested day the highest 'max cpu' reading is
    compared with the mean 'max cpu' of that hour taken over all rows of
    ``df``. The result is optionally drawn as two lines with the area
    between them shaded green (day above the mean) or red (day below).

    Args:
        df: DataFrame with the columns 'max cpu', 'hour', 'year', 'month'
            and 'dayofyear'. 'dayofyear' is compared directly with
            ``day_input``; in this file it is filled from ``.dt.day``, i.e.
            it holds the day of the month.
        year_input: Year of the day to analyse.
        month_input: Month of the day to analyse.
        day_input: Value matched against the 'dayofyear' column.
        plot: When True (default) draw and show the chart; when False only
            compute and return the comparison table.

    Returns:
        DataFrame with one row per hour of the selected day and the columns
        'hour', 'max cpu_hour', 'max cpu_avg', 'above_below' and
        'above_below_value'.

    Raises:
        ValueError: If ``df`` contains no rows for the requested day.
    """
    comparison = _hourly_above_below(df, year_input, month_input, day_input)
    if plot:
        _plot_hourly_comparison(comparison)
    return comparison


def _hourly_above_below(df, year_input, month_input, day_input):
    """Build the per-hour table of the day's peak versus the overall hourly mean."""
    on_day = (
        (df['year'] == year_input)
        & (df['month'] == month_input)
        & (df['dayofyear'] == day_input)
    )
    if not on_day.any():
        raise ValueError(
            f"no rows found for year={year_input}, month={month_input}, "
            f"day={day_input}"
        )

    # Baseline: mean 'max cpu' of each hour over the whole dataset.
    hourly_mean = df.groupby('hour').agg({'max cpu': 'mean'}).reset_index()
    # Selected day: highest 'max cpu' reading within each hour.
    hourly_peak = df[on_day].groupby('hour').agg({'max cpu': 'max'}).reset_index()

    comparison = pd.merge(hourly_peak, hourly_mean, on='hour',
                          suffixes=('_hour', '_avg'))
    # Vectorised comparison instead of a row-wise apply().
    is_above = comparison['max cpu_hour'] > comparison['max cpu_avg']
    comparison['above_below'] = is_above.map({True: 'above', False: 'below'})
    comparison['above_below_value'] = (
        comparison['max cpu_hour'] - comparison['max cpu_avg']
    )
    return comparison


def _plot_hourly_comparison(df_graph):
    """Draw the day's hourly peak against the hourly mean and shade the gap."""
    # Count with a boolean sum: value_counts()['above'] raises KeyError when
    # a day lies entirely on one side of the mean.
    above_count = int((df_graph['above_below'] == 'above').sum())
    below_count = len(df_graph) - above_count

    # Grey axes background on top of seaborn's darkgrid style.
    sns.set_style("darkgrid", {"axes.facecolor": "#A9A9A9"})
    plt.figure(figsize=(14, 8))

    # Plain line plots of the raw hourly values (no smoothing or estimator).
    sns.lineplot(x='hour', y='max cpu_hour', data=df_graph, color='white', label='Max CPU Hour', marker='', sort=False, errorbar=None, estimator=None)
    sns.lineplot(x='hour', y='max cpu_avg', data=df_graph, color='lightblue', label='Max CPU Avg', marker='', sort=False, errorbar=None, estimator=None)

    # Shade the area between the two lines: green where the day is at/above
    # the mean, red where it is below.
    plt.fill_between(df_graph['hour'], df_graph['max cpu_hour'], df_graph['max cpu_avg'],
                     where=(df_graph['max cpu_hour'] >= df_graph['max cpu_avg']),
                     facecolor='green', interpolate=True, alpha=0.3,
                     label=f'Above average ({above_count} h)')
    plt.fill_between(df_graph['hour'], df_graph['max cpu_hour'], df_graph['max cpu_avg'],
                     where=(df_graph['max cpu_hour'] < df_graph['max cpu_avg']),
                     facecolor='red', interpolate=True, alpha=0.3,
                     label=f'Below average ({below_count} h)')

    # The title sits on the white figure background (only the axes area is
    # grey), so it must be dark to be readable.
    plt.title('Hourly Max CPU Comparison with Average', color='black')
    plt.xlabel('Hour', color='black')
    plt.ylabel('Max CPU', color='black')
    plt.xticks(df_graph['hour'], color='black')
    plt.yticks(color='black')
    plt.legend()
    plt.show()