例如,我们有数据集
tips
,其中包含列 day
、total_bill
和 sex
。
我想可视化箱线图(x=day
,y=total_bill
,color=sex
)。之后,我想对每天的女性和男性顾客做 t 检验并计算 p 值。如果 p 值 < 0.05,我想在图上添加星号。我应该如何修改下面的代码?
下面的示例只比较了不同的日期,没有按性别区分:
from scipy import stats
import plotly.express as px
import plotly.graph_objects as go
# Load the demo "tips" dataset that ships with Plotly Express.
tips = px.data.tips()

# Build the figure by hand: one box trace per day, added in weekday order.
fig = go.Figure()
for day in ['Thur','Fri','Sat','Sun']:
    fig.add_trace(go.Box(
        y=tips[tips['day'] == day].total_bill,  # bills recorded on this day only
        name=day,
        boxpoints='outliers'
    ))
def add_pvalue_annotation(days, y_range, symbol=''):
    """
    Draw a significance bracket between two days and label it.

    Runs scipy's independent two-sample t-test on ``total_bill`` for the two
    days (read from the module-level ``tips``), draws a three-segment bracket
    between them on the module-level ``fig`` and writes '*' (p < 0.05) or
    'ns' (p >= 0.05) just above the bracket.

    arguments:
    days --- a list of two different days e.g. ['Thur','Sat']
    y_range --- a list of y_range in the form [y_min, y_max] in paper units
    symbol --- fallback label; it is overwritten by 'ns' or '*' unless
               neither comparison holds (i.e. the p-value is NaN)
    """
    first_day, second_day = days[0], days[1]
    y_low, y_high = y_range[0], y_range[1]

    # Index [1] of the t-test result is the p-value.
    pvalue = stats.ttest_ind(
        tips[tips['day'] == first_day].total_bill,
        tips[tips['day'] == second_day].total_bill)[1]

    if pvalue >= 0.05:
        symbol = 'ns'
    elif pvalue < 0.05:
        symbol = '*'

    # The bracket is three straight lines: left tick, top bar, right tick.
    # x is given in axis (category) coordinates, y in paper coordinates so the
    # bracket can sit above the plotting area.
    bracket_segments = (
        (first_day, y_low, first_day, y_high),
        (first_day, y_high, second_day, y_high),
        (second_day, y_high, second_day, y_low),
    )
    for seg_x0, seg_y0, seg_x1, seg_y1 in bracket_segments:
        fig.add_shape(
            type="line",
            xref="x", yref="paper",
            x0=seg_x0, y0=seg_y0, x1=seg_x1, y1=seg_y1,
            line=dict(color="black", width=2),
        )

    ## add text at the correct x, y coordinates
    ## for bars, there is a direct mapping from the bar number to 0, 1, 2...
    bar_xcoord_map = {x: idx for idx, x in enumerate(['Thur','Fri','Sat','Sun'])}
    fig.add_annotation(dict(
        font=dict(color="black", size=14),
        # centre the label horizontally between the two days
        x=(bar_xcoord_map[first_day] + bar_xcoord_map[second_day])/2,
        # nudge the label slightly above the top of the bracket
        y=y_high*1.03,
        showarrow=False,
        text=symbol,
        textangle=0,
        xref="x",
        yref="paper"
    ))
# Annotate two day pairs; the second bracket uses a higher paper-unit range
# than the first.
for day_pair, bracket_y in (
    (['Thur','Sun'], [1.01,1.02]),
    (['Thur','Sat'], [1.05,1.06]),
):
    add_pvalue_annotation(day_pair, bracket_y)
fig.show()
我在这里找到了这个有用的例子:Plotly 箱线图的 p 值显著性标注
当您设置箱线图时,使用plotly.express中的
px.box
将会很有用,因为您可以传递参数color="sex"
,这将为每天的每个性别创建两个箱线图。您还需要对 tips
DataFrame 进行排序,以便按顺序绘制一周中的日子。
然后可以修改
add_pvalue_annotation
函数,使其计算每一天内男性与女性之间 t 检验的 p 值(而不是一周中不同日期的 tips 数据之间的 t 检验)。您还需要更改标注线的起点和终点,使它们位于同一天内的男性和女性两个箱线图之间,而不是位于不同的日期之间。
对于
tips
数据集,我对一周中的每一天分别做了男性与女性之间的 t 检验(例如周四的男性与女性、周五的男性与女性……),结果没有任何一个 p 值低于 0.05。
但是,为了证明
add_pvalue_annotation
函数能够正确放置注释,我将 p 值阈值设置为 0.15,以便在图表上注释周五 (p-value = 0.13
) 的男性和女性之间的 p 值。
from scipy import stats
import plotly.express as px
import plotly.graph_objects as go
from pandas.api.types import CategoricalDtype
# Demo dataset bundled with Plotly Express.
tips = px.data.tips()

# Turn `day` into an ordered categorical so that sorting the frame follows
# the order listed in cat_order.
cat_order = ['Thur', 'Fri', 'Sat', 'Sun']
cat_weekdays = CategoricalDtype(categories=cat_order, ordered=True)
tips['day'] = tips['day'].astype(cat_weekdays)
tips = tips.sort_values(by='day')

# One box per (day, sex) combination.
fig = px.box(tips, x="day", y="total_bill", color="sex")
def add_pvalue_annotation(day, y_range, symbol='', pvalue_th=0.05):
    """
    Mark a day with a bracket and '*' when Male and Female bills differ.

    Runs scipy's independent two-sample t-test on ``total_bill`` between the
    'Male' and 'Female' rows of the module-level ``tips`` for the given day.
    If the p-value is strictly below ``pvalue_th``, a three-segment bracket
    and a '*' label are added to the module-level ``fig``; otherwise the
    figure is left untouched (non-significant days get no 'ns' label).

    arguments:
    day --- the day for which you want to calculate the p-value on a t-test between Men and Women (e.g. 'Thur')
    y_range --- a list of y_range in the form [y_min, y_max] in paper units
    symbol --- accepted for backward compatibility; its value is not used,
               the label drawn is always '*'
    pvalue_th --- significance threshold the p-value is compared against
    """
    is_day = tips['day'] == day
    # Index [1] of the t-test result is the p-value.
    pvalue = stats.ttest_ind(
        tips[is_day & (tips['sex'] == 'Male')].total_bill,
        tips[is_day & (tips['sex'] == 'Female')].total_bill
    )[1]

    # Written as `not (p < th)` so that a NaN p-value also draws nothing.
    if not (pvalue < pvalue_th):
        return

    ## for bars, there is a direct mapping from the bar number to 0, 1, 2...
    bar_xcoord_map = {x: idx for idx, x in enumerate(cat_order)}
    x_coordinate = bar_xcoord_map[day]
    # The bracket spans 0.2 axis units either side of the day's centre;
    # presumably this lines up with the two side-by-side boxes -- TODO confirm.
    x_start, x_end = x_coordinate - 0.2, x_coordinate + 0.2
    y_low, y_high = y_range[0], y_range[1]
    symbol = '*'

    # The bracket is three straight lines: left tick, top bar, right tick.
    # x is in axis coordinates, y in paper coordinates.
    bracket_segments = (
        (x_start, y_low, x_start, y_high),
        (x_start, y_high, x_end, y_high),
        (x_end, y_high, x_end, y_low),
    )
    for seg_x0, seg_y0, seg_x1, seg_y1 in bracket_segments:
        fig.add_shape(
            type="line",
            xref="x", yref="paper",
            x0=seg_x0, y0=seg_y0, x1=seg_x1, y1=seg_y1,
            line=dict(color="black", width=2),
        )

    ## add text at the correct x, y coordinates
    fig.add_annotation(dict(
        font=dict(color="black", size=14),
        x=x_coordinate,
        # nudge the label slightly above the top of the bracket
        y=y_high*1.03,
        showarrow=False,
        text=symbol,
        textangle=0,
        xref="x",
        yref="paper"
    ))
# Test every day. The threshold is raised from 0.05 to 0.15 purely to
# demonstrate annotation placement: no day reaches p < 0.05 on this dataset.
for day in cat_order:
    add_pvalue_annotation(day, [1.01,1.02], pvalue_th=0.15)
fig.show()
另外,如果有帮助的话,这里有一个可以在 Plotly 图表上添加统计标注的 Python 库:
https://pypi.org/project/taplib/ https://github.com/FedericaPersiani/tap