我用python自学时间序列分解。我收到 float() 类型错误。
我的数据来源是政府公开数据。基本上,我想直观地看到新冠旅行限制结束后高铁客流的趋势。时间段为2023年1月至11月
第一部分是数据清理和分项设置。第二部分是时间序列和按月分解。
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')
import seaborn as sns
from pylab import rcParams
df = pd.read_csv("https://www.immd.gov.hk/opendata/eng/transport/immigration_clearance/statistics_on_daily_passenger_traffic.csv")
# data cleaning
df = df.iloc[: , :-1]
df = df[df["Date"].str.contains("2023") == True]
from datetime import date
from datetime import datetime
df["Date"] = df["Date"].apply(lambda x: datetime.strptime(str(x), "%d-%m-%Y"))
control_point = df['Control Point'].tolist()
options = ['Airport', 'Express Rail Link West Kowloon', 'Lo Wu', 'Lok Ma Chau Spur Line', 'Heung Yuen Wai', 'Hong Kong-Zhuhai-Macao Bridge', 'Shenzhen Bay']
df_clean = df.loc[df['Control Point'].isin(options)]
df_XRL = df[df["Control Point"].str.contains("Express Rail Link West Kowloon") & df["Arrival / Departure"].str.contains("Departure")]
df_XRL = df_XRL[["Date","Hong Kong Residents"]]
df_XRL = df_XRL[~(df_XRL['Date'] > '2023-11-30')]
df_XRL['Month'] = pd.DatetimeIndex(df_XRL['Date']).strftime("%b")
df_XRL['Week day'] = pd.DatetimeIndex(df_XRL['Date']).strftime("%a")
# Pivot table
from numpy import nan
monthOrder = ['Jan', 'Feb', 'Mar', 'Apr','May','Jun','Jul','Aug','Sep','Oct','Nov']
dayOrder = ['Mon','Tue','Wed','Thu','Fri','Sat','Sun']
pivot_XRL = pd.pivot_table(df_XRL, index=['Month'],
values=['Hong Kong Residents'],
columns=['Week day'], aggfunc=('sum')).loc[monthOrder, (slice(None), dayOrder)]
# Time Series Decomposition - where errors occur
from statsmodels.tsa.seasonal import seasonal_decompose
decomposition = seasonal_decompose(df_XRL, model = "additive")
decomposition.plot()
plt.rcParams['axes.labelsize'] = 16
plt.rcParams['axes.titlesize'] = 16
错误信息:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_6628\3049465439.py in <module>
1 from statsmodels.tsa.seasonal import seasonal_decompose
----> 2 decomposition = seasonal_decompose(df_XRL, model = "additive")
3 decomposition.plot()
4 plt.rcParams['axes.labelsize'] = 16
5 plt.rcParams['axes.titlesize'] = 16
~\anaconda3\lib\site-packages\statsmodels\tsa\seasonal.py in seasonal_decompose(x, model, filt, period, two_sided, extrapolate_trend)
140 pfreq = getattr(getattr(x, "index", None), "inferred_freq", None)
141
--> 142 x = array_like(x, "x", maxdim=2)
143 nobs = len(x)
144
~\anaconda3\lib\site-packages\statsmodels\tools\validation\validation.py in array_like(obj, name, dtype, ndim, maxdim, shape, order, contiguous, optional)
133 if optional and obj is None:
134 return None
--> 135 arr = np.asarray(obj, dtype=dtype, order=order)
136 if maxdim is not None:
137 if arr.ndim > maxdim:
~\anaconda3\lib\site-packages\pandas\core\generic.py in __array__(self, dtype)
2082 def __array__(self, dtype: npt.DTypeLike | None = None) -> np.ndarray:
2083 values = self._values
-> 2084 arr = np.asarray(values, dtype=dtype)
2085 if (
2086 astype_is_view(values.dtype, arr.dtype)
TypeError: float() argument must be a string or a number, not 'Timestamp'
我发现了错误。这个问题是我没有将日期设置为索引。添加脚本后:
df_XRL = df_XRL.set_index('Date')
,我可以绘制图表了。