
问题描述 投票:5回答:3



Per C   V
1   c   3
1   a   4
1   c   1
2   a   6
2   b   5
3   j   7
4   x   6
4   x   5
4   a   9
5   a   2
6   c   3
6   k   6


Per C   V
1   c   4
1   a   4
2   c   4
2   a   10
2   b   5
3   c   4
3   a   10
3   b   5
3   j   7
4   c   4
4   a   19
4   b   5
4   j   7
4   x   11
5   c   4
5   a   21
5   b   5
5   j   7
5   x   11
6   c   7
6   a   21
6   b   5
6   j   7
6   x   11
6   k   6
pandas pandas-groupby


    # For every period i, take all rows up to and including i, relabel them
    # as period i, then sum V per (Per, C) to get the running totals.
    (pd.concat([df.loc[df.Per <= i][['C', 'V']].assign(Per=i) for i in df.Per.unique()])
       .groupby(by=['Per', 'C'])
       .sum()
       .reset_index())

    Per  C   V
0     1  a   4
1     1  c   4
2     2  a  10
3     2  b   5
4     2  c   4
5     3  a  10
6     3  b   5
7     3  c   4
8     3  j   7
9     4  a  19
10    4  b   5
11    4  c   4
12    4  j   7
13    4  x  11
14    5  a  21
15    5  b   5
16    5  c   4
17    5  j   7
18    5  x  11
19    6  a  21
20    6  b   5
21    6  c   7
22    6  j   7
23    6  k   6
24    6  x  11




# Sum V per (Per, C). groupby(level=...).sum() replaces Series.sum(level=...),
# which newer pandas releases no longer accept.
s = df.set_index(['Per', 'C']).V.groupby(level=[0, 1]).sum()

# Expand to every (Per, C) combination, filling the gaps with 0 so the
# per-category cumulative sum carries earlier totals forward; combinations
# whose running total is not positive are dropped.
s.reindex(
    pd.MultiIndex.from_product(s.index.levels, names=s.index.names),
    fill_value=0,
).groupby('C').cumsum().loc[lambda x: x > 0].reset_index()

    Per  C   V
0     1  a   4
1     1  c   4
2     2  a  10
3     2  b   5
4     2  c   4
5     3  a  10
6     3  b   5
7     3  c   4
8     3  j   7
9     4  a  19
10    4  b   5
11    4  c   4
12    4  j   7
13    4  x  11
14    5  a  21
15    5  b   5
16    5  c   4
17    5  j   7
18    5  x  11
19    6  a  21
20    6  b   5
21    6  c   7
22    6  j   7
23    6  k   6
24    6  x  11



# Pivot to a Per x C table of sums, accumulate down the periods, then turn
# the table back into long form. Zeros become NaN so that categories with no
# running total yet are dropped from the stacked result.
(df.pivot_table(index='Per', columns='C', values='V', aggfunc='sum')
   .fillna(0)
   .cumsum(axis=0)
   .replace(0, np.nan)
   .stack()
   .dropna()
   .reset_index())


    Per  C     0
0     1  a   4.0
1     1  c   4.0
2     2  a  10.0
3     2  b   5.0
4     2  c   4.0
5     3  a  10.0
6     3  b   5.0
7     3  c   4.0
8     3  j   7.0
9     4  a  19.0
10    4  b   5.0
11    4  c   4.0
12    4  j   7.0
13    4  x  11.0
14    5  a  21.0
15    5  b   5.0
16    5  c   4.0
17    5  j   7.0
18    5  x  11.0
19    6  a  21.0
20    6  b   5.0
21    6  c   7.0
22    6  j   7.0
23    6  k   6.0
24    6  x  11.0


In [131]: df.pivot_table(index='Per', columns='C', values='V', aggfunc='sum')
C      a    b    c    j    k     x
1    4.0  NaN  4.0  NaN  NaN   NaN
2    6.0  5.0  NaN  NaN  NaN   NaN
3    NaN  NaN  NaN  7.0  NaN   NaN
4    9.0  NaN  NaN  NaN  NaN  11.0
5    2.0  NaN  NaN  NaN  NaN   NaN
6    NaN  NaN  3.0  NaN  6.0   NaN


与 using_concat 相比，pivot_table/cumsum 方法也提供了相当大的速度优势，但 piRSquared 的解决方案是最快的。在 1000 行的 df 上：

In [169]: %timeit using_reindex2(df)
100 loops, best of 3: 6.86 ms per loop

In [152]: %timeit using_reindex(df)
100 loops, best of 3: 8.36 ms per loop

In [80]: %timeit using_pivot(df)
100 loops, best of 3: 8.58 ms per loop

In [79]: %timeit using_concat(df)
10 loops, best of 3: 84 ms per loop


import numpy as np
import pandas as pd

def using_pivot(df):
    """Return running totals of ``V`` per ``C`` for each period ``P`` via a pivot.

    Args:
        df: DataFrame with columns ``P`` (period), ``C`` (category) and
            ``V`` (value).

    Returns:
        DataFrame with columns ``P``, ``C`` and ``0``; the last column holds
        the cumulative sum (as floats) of ``V`` for that category over all
        periods up to and including ``P``. Combinations whose running total
        is exactly 0 are omitted.
    """
    return (df.pivot_table(index='P', columns='C', values='V', aggfunc='sum')
              .fillna(0)            # missing (P, C) cells contribute nothing
              .cumsum(axis=0)       # accumulate down the periods
              .replace(0, np.nan)   # mark "no total yet" so it can be dropped
              .stack()
              .dropna()             # explicit: do not rely on stack()'s NaN default
              .reset_index())

def using_reindex(df):
    """Return running totals of ``V`` per ``C`` for each period ``P`` via reindex.

    Adapted from https://stackoverflow.com/a/49097572/190597 (piRSquared).

    Args:
        df: DataFrame with columns ``P`` (period), ``C`` (category) and
            ``V`` (value).

    Returns:
        DataFrame with columns ``P``, ``C`` and ``V``, where ``V`` is the
        cumulative sum for that category over all periods up to and
        including ``P``. Combinations whose running total is not positive
        are omitted.
    """
    # groupby(level=...).sum() replaces Series.sum(level=...), which newer
    # pandas releases no longer accept.
    s = df.set_index(['P', 'C']).V.groupby(level=[0, 1]).sum()

    # Expand to the full P x C grid; the gaps must be 0 (not NaN) so the
    # cumulative sum carries each category's earlier total forward.
    return s.reindex(
        pd.MultiIndex.from_product(s.index.levels, names=s.index.names),
        fill_value=0,
    ).groupby('C').cumsum().loc[lambda x: x > 0].reset_index()

def using_reindex2(df):
    """Return running totals of ``V`` per ``C`` for each period ``P`` via reindex.

    Adapted from https://stackoverflow.com/a/49097572/190597 (piRSquared),
    with the first line changed to aggregate with a plain ``groupby``.

    Args:
        df: DataFrame with columns ``P`` (period), ``C`` (category) and
            ``V`` (value).

    Returns:
        DataFrame with columns ``P``, ``C`` and ``V``, where ``V`` is the
        cumulative sum for that category over all periods up to and
        including ``P``. Combinations whose running total is not positive
        are omitted.
    """
    s = df.groupby(['P', 'C'])['V'].sum()
    # Expand to the full P x C grid; the gaps must be 0 (not NaN) so the
    # cumulative sum carries each category's earlier total forward.
    return s.reindex(
        pd.MultiIndex.from_product(s.index.levels, names=s.index.names),
        fill_value=0,
    ).groupby('C').cumsum().loc[lambda x: x > 0].reset_index()

def using_concat(df):
    """Return running totals of ``V`` per ``C`` for each period ``P`` via concat.

    Adapted from https://stackoverflow.com/a/49095139/190597 (Allen).

    Args:
        df: DataFrame with columns ``P`` (period), ``C`` (category) and
            ``V`` (value).

    Returns:
        DataFrame with columns ``P``, ``C`` and ``V``, where ``V`` is the sum
        of all values of that category in periods up to and including ``P``.
    """
    # For every period i, take all rows up to and including i and relabel
    # them as period i; summing per (P, C) then yields the running totals.
    return (pd.concat([df.loc[df.P <= i][['C', 'V']].assign(P=i)
                       for i in df.P.unique()])
              .groupby(by=['P', 'C'])
              .sum()
              .reset_index())

def make(nrows):
    """Build a random integer frame with ``nrows`` rows and columns P, C, V.

    Every cell is drawn with ``np.random.randint(50, ...)``, i.e. from the
    integers 0 through 49, using NumPy's global random state.
    """
    cells = np.random.randint(50, size=(nrows, 3))
    return pd.DataFrame(cells, columns=['P', 'C', 'V'])

# 1000-row random sample frame; presumably the input used for the %timeit
# comparisons of the using_* functions (confirm against the timings).
df = make(1000)
© www.soinside.com 2019 - 2024. All rights reserved.