データフレーム内から各ファセットの直近3日間の有効データの平均を求める – 各行についてNaNを除いた後尾n列の平均を求める

How to calculate the average of the most recent three non-nan value using Python – StackOverflow

回答で挙げられているNumpyソリューションがエレガントで色々な場面で活用できそう.時間効率を考えれば,Numpyソリューションがリーズナブルチョイスだが,簡便さを考えれば,Pandasを用いればより簡便に考える事ができる.ただ,色々なやり方が考えられるし,結局pandasとnumpyのトレードオフみたいな感じ.1つ目がPandasらしく,かつ内部実装はnumpyなのである程度効率も得つつ,という感じかな.というか,それ以上考え出すと,本末転倒.

import io
import pandas as pd


strings = """name day1 day2 day3 day4  day5 day6 day7
A    1     1   nan   2    3    0   3
B    nan   nan nan   nan  nan  nan 3
C    1     1   0     1    1    1   1
D    1     1   0     1    nan  1   4"""
df = pd.read_csv(io.StringIO(strings), sep='\s+')
print(df)

v = df.iloc[:, 1:]
cond = v.notna().cumsum(1).ge(3)
print(df.assign(expected=v[cond].ffill(1).iloc[:, -3:].mean(1)))
%timeit v = df.iloc[:, 1:];cond = v.notna().cumsum(1).ge(3);df.assign(expected=v[cond].ffill(1).iloc[:, -3:].mean(1))

v = df.select_dtypes(include=[np.number])
cond = v.notna().cumsum(1).ge(3)
print(df.assign(expected=v[cond].ffill(1).iloc[:, -3:].mean(1)))
%timeit v = df.select_dtypes(include=[np.number]);cond = v.notna().cumsum(1).ge(3);df.assign(expected=v[cond].ffill(1).iloc[:, -3:].mean(1))
  name  day1  day2  day3  day4  day5  day6  day7
0    A   1.0   1.0   NaN   2.0   3.0   0.0     3
1    B   NaN   NaN   NaN   NaN   NaN   NaN     3
2    C   1.0   1.0   0.0   1.0   1.0   1.0     1
3    D   1.0   1.0   0.0   1.0   NaN   1.0     4
  name  day1  day2  day3  day4  day5  day6  day7  expected
0    A   1.0   1.0   NaN   2.0   3.0   0.0     3       2.0
1    B   NaN   NaN   NaN   NaN   NaN   NaN     3       NaN
2    C   1.0   1.0   0.0   1.0   1.0   1.0     1       1.0
3    D   1.0   1.0   0.0   1.0   NaN   1.0     4       2.0
100 loops, best of 3: 3.42 ms per loop
  name  day1  day2  day3  day4  day5  day6  day7  expected
0    A   1.0   1.0   NaN   2.0   3.0   0.0     3       2.0
1    B   NaN   NaN   NaN   NaN   NaN   NaN     3       NaN
2    C   1.0   1.0   0.0   1.0   1.0   1.0     1       1.0
3    D   1.0   1.0   0.0   1.0   NaN   1.0     4       2.0
100 loops, best of 3: 4.26 ms per loop

 
 
 

import io
import pandas as pd
import numpy as np


def justify(a, invalid_val=0, axis=1, side='left'):    
    """
    Justifies a 2D array

    Parameters
    ----------
    A : ndarray
        Input array to be justified
    axis : int
        Axis along which justification is to be made
    side : str
        Direction of justification. It could be 'left', 'right', 'up', 'down'
        It should be 'left' or 'right' for axis=1 and 'up' or 'down' for axis=0.

    """

    if invalid_val is np.nan:
        mask = ~np.isnan(a)
    else:
        mask = a!=invalid_val
    justified_mask = np.sort(mask,axis=axis)
    if (side=='up') | (side=='left'):
        justified_mask = np.flip(justified_mask,axis=axis)
    out = np.full(a.shape, invalid_val) 
    if axis==1:
        out[justified_mask] = a[mask]
    else:
        out.T[justified_mask.T] = a.T[mask.T]
    return out


strings = """name day1 day2 day3 day4  day5 day6 day7
A    1     1   nan   2    3    0   3
B    nan   nan nan   nan  nan  nan 3
C    1     1   0     1    1    1   1
D    1     1   0     1    nan  1   4"""
df = pd.read_csv(io.StringIO(strings), sep='\s+')
print(df)

N = 3 # last N entries for averaging
avg = np.mean(justify(df.iloc[:, 1:].values,invalid_val=np.nan,axis=1, side='right')[:,-N:],1)
print(df.assign(expected=avg))
%timeit avg = np.mean(justify(df.iloc[:, 1:].values,invalid_val=np.nan,axis=1, side='right')[:,-N:],1);df.assign(expected=avg)

cond = df.iloc[:, :0:-1].notna().cumsum(1).eq(3).iloc[:, ::-1].cumsum(1).astype(bool)
print(df.assign(expected=df.iloc[:, 1:][cond].mean(1)))
%timeit cond = df.iloc[:, :0:-1].notna().cumsum(1).eq(3).iloc[:, ::-1].cumsum(1).astype(bool);df.assign(expected=df.iloc[:, 1:][cond].mean(1))

v = df.iloc[:, 1:]
cond = v.notna().cumsum(1).ge(3)
print(df.assign(expected=v[cond].ffill(1).iloc[:, -3:].mean(1)))
%timeit v = df.iloc[:, 1:];cond = v.notna().cumsum(1).ge(3);df.assign(expected=v[cond].ffill(1).iloc[:, -3:].mean(1))

df = pd.concat([df]*100000, ignore_index=True)
%timeit avg = np.mean(justify(df.iloc[:, 1:].values,invalid_val=np.nan,axis=1, side='right')[:,-N:],1);df.assign(expected=avg)
%timeit cond = df.iloc[:, :0:-1].notna().cumsum(1).eq(3).iloc[:, ::-1].cumsum(1).astype(bool);df.assign(expected=df.iloc[:, 1:][cond].mean(1))
%timeit v = df.iloc[:, 1:];cond = v.notna().cumsum(1).ge(3);df.assign(expected=v[cond].ffill(1).iloc[:, -3:].mean(1))
  name  day1  day2  day3  day4  day5  day6  day7
0    A   1.0   1.0   NaN   2.0   3.0   0.0     3
1    B   NaN   NaN   NaN   NaN   NaN   NaN     3
2    C   1.0   1.0   0.0   1.0   1.0   1.0     1
3    D   1.0   1.0   0.0   1.0   NaN   1.0     4
  name  day1  day2  day3  day4  day5  day6  day7  expected
0    A   1.0   1.0   NaN   2.0   3.0   0.0     3       2.0
1    B   NaN   NaN   NaN   NaN   NaN   NaN     3       NaN
2    C   1.0   1.0   0.0   1.0   1.0   1.0     1       1.0
3    D   1.0   1.0   0.0   1.0   NaN   1.0     4       2.0
1000 loops, best of 3: 740 µs per loop
  name  day1  day2  day3  day4  day5  day6  day7  expected
0    A   1.0   1.0   NaN   2.0   3.0   0.0     3       2.0
1    B   NaN   NaN   NaN   NaN   NaN   NaN     3       NaN
2    C   1.0   1.0   0.0   1.0   1.0   1.0     1       1.0
3    D   1.0   1.0   0.0   1.0   NaN   1.0     4       2.0
100 loops, best of 3: 3.7 ms per loop
  name  day1  day2  day3  day4  day5  day6  day7  expected
0    A   1.0   1.0   NaN   2.0   3.0   0.0     3       2.0
1    B   NaN   NaN   NaN   NaN   NaN   NaN     3       NaN
2    C   1.0   1.0   0.0   1.0   1.0   1.0     1       1.0
3    D   1.0   1.0   0.0   1.0   NaN   1.0     4       2.0
100 loops, best of 3: 3.42 ms per loop
10 loops, best of 3: 78.7 ms per loop
10 loops, best of 3: 162 ms per loop
10 loops, best of 3: 131 ms per loop

 
 

import io
import pandas as pd
import numpy as np


def justify(a, invalid_val=0, axis=1, side='left'):    
    """
    Justifies a 2D array

    Parameters
    ----------
    A : ndarray
        Input array to be justified
    axis : int
        Axis along which justification is to be made
    side : str
        Direction of justification. It could be 'left', 'right', 'up', 'down'
        It should be 'left' or 'right' for axis=1 and 'up' or 'down' for axis=0.

    """

    if invalid_val is np.nan:
        mask = ~np.isnan(a)
    else:
        mask = a!=invalid_val
    justified_mask = np.sort(mask,axis=axis)
    if (side=='up') | (side=='left'):
        justified_mask = np.flip(justified_mask,axis=axis)
    out = np.full(a.shape, invalid_val) 
    if axis==1:
        out[justified_mask] = a[mask]
    else:
        out.T[justified_mask.T] = a.T[mask.T]
    return out


strings = """name day1 day2 day3 day4  day5 day6 day7
A    1     1   nan   2    3    0   3
B    nan   nan nan   nan  nan  nan 3
C    1     1   0     1    1    1   1
D    1     1   0     1    nan  1   4"""
df = pd.read_csv(io.StringIO(strings), sep='\s+')
print(df)

N = 3 # last N entries for averaging
avg = np.mean(justify(df.iloc[:, 1:].values,invalid_val=np.nan,axis=1, side='right')[:,-N:],1)
print(df.assign(expected=avg))
%timeit avg = np.mean(justify(df.iloc[:, 1:].values,invalid_val=np.nan,axis=1, side='right')[:,-N:],1);df.assign(expected=avg)

cond = df.iloc[:, :0:-1].notna().cumsum(1).eq(3).iloc[:, ::-1].cumsum(1).astype(bool)
print(df.assign(expected=df.iloc[:, 1:][cond].mean(1)))
%timeit cond = df.iloc[:, :0:-1].notna().cumsum(1).eq(3).iloc[:, ::-1].cumsum(1).astype(bool);df.assign(expected=df.iloc[:, 1:][cond].mean(1))

v = df.iloc[:, 1:]
arr = v.values[:, ::-1]
cond = ((~np.isnan(arr)).cumsum(1) == 3)[:, ::-1].cumsum(1).astype(bool)
cond_df = pd.DataFrame(cond, index=v.index, columns=v.columns)
print(df.assign(expected=v[cond_df].mean(1)))
%timeit v = df.iloc[:, 1:];arr = v.values[:, ::-1];cond = ((~np.isnan(arr)).cumsum(1) == 3)[:, ::-1].cumsum(1).astype(bool);cond_df = pd.DataFrame(cond, index=v.index, columns=v.columns);df.assign(expected=v[cond_df].mean(1))

v = df.iloc[:, :0:-1]
cond = v.notna().cumsum(1).where(lambda x: x==3).bfill(1).fillna(0).astype(bool)
print(df.assign(expected=v[cond].mean(1)))
%timeit cond = v.notna().cumsum(1).where(lambda x: x==3).bfill(1).fillna(0).astype(bool);df.assign(expected=v[cond].mean(1))

v = df.iloc[:, :0:-1]
arr = v.notna().cumsum(1).values
cond = (arr == 3).any(1)[:, None] & (arr <= 3)
cond_df = pd.DataFrame(cond, index=v.index, columns=v.columns)
print(df.assign(expected=v[cond_df].mean(1)))
%timeit v = df.iloc[:, :0:-1];arr = v.notna().cumsum(1).values;cond = (arr == 3).any(1)[:, None] & (arr <= 3);cond_df = pd.DataFrame(cond, index=v.index, columns=v.columns);df.assign(expected=v[cond_df].mean(1))


df = pd.concat([df]*100000, ignore_index=True)
%timeit avg = np.mean(justify(df.iloc[:, 1:].values,invalid_val=np.nan,axis=1, side='right')[:,-N:],1);df.assign(expected=avg)
%timeit cond = df.iloc[:, :0:-1].notna().cumsum(1).eq(3).iloc[:, ::-1].cumsum(1).astype(bool);df.assign(expected=df.iloc[:, 1:][cond].mean(1))
%timeit v = df.iloc[:, 1:];arr = v.values[:, ::-1];cond = ((~np.isnan(arr)).cumsum(1) == 3)[:, ::-1].cumsum(1).astype(bool);cond_df = pd.DataFrame(cond, index=v.index, columns=v.columns);df.assign(expected=v[cond_df].mean(1))
%timeit v = df.iloc[:, :0:-1];cond = v.notna().cumsum(1).where(lambda x: x==3).bfill(1).fillna(0).astype(bool);df.assign(expected=v[cond].mean(1))
%timeit v = df.iloc[:, :0:-1];arr = v.notna().cumsum(1).values;cond = (arr == 3).any(1)[:, None] & (arr <= 3);cond_df = pd.DataFrame(cond, index=v.index, columns=v.columns);df.assign(expected=v[cond_df].mean(1))
  name  day1  day2  day3  day4  day5  day6  day7
0    A   1.0   1.0   NaN   2.0   3.0   0.0     3
1    B   NaN   NaN   NaN   NaN   NaN   NaN     3
2    C   1.0   1.0   0.0   1.0   1.0   1.0     1
3    D   1.0   1.0   0.0   1.0   NaN   1.0     4
  name  day1  day2  day3  day4  day5  day6  day7  expected
0    A   1.0   1.0   NaN   2.0   3.0   0.0     3       2.0
1    B   NaN   NaN   NaN   NaN   NaN   NaN     3       NaN
2    C   1.0   1.0   0.0   1.0   1.0   1.0     1       1.0
3    D   1.0   1.0   0.0   1.0   NaN   1.0     4       2.0
1000 loops, best of 3: 739 µs per loop
  name  day1  day2  day3  day4  day5  day6  day7  expected
0    A   1.0   1.0   NaN   2.0   3.0   0.0     3       2.0
1    B   NaN   NaN   NaN   NaN   NaN   NaN     3       NaN
2    C   1.0   1.0   0.0   1.0   1.0   1.0     1       1.0
3    D   1.0   1.0   0.0   1.0   NaN   1.0     4       2.0
100 loops, best of 3: 3.69 ms per loop
  name  day1  day2  day3  day4  day5  day6  day7  expected
0    A   1.0   1.0   NaN   2.0   3.0   0.0     3       2.0
1    B   NaN   NaN   NaN   NaN   NaN   NaN     3       NaN
2    C   1.0   1.0   0.0   1.0   1.0   1.0     1       1.0
3    D   1.0   1.0   0.0   1.0   NaN   1.0     4       2.0
100 loops, best of 3: 2.68 ms per loop
  name  day1  day2  day3  day4  day5  day6  day7  expected
0    A   1.0   1.0   NaN   2.0   3.0   0.0     3       2.0
1    B   NaN   NaN   NaN   NaN   NaN   NaN     3       NaN
2    C   1.0   1.0   0.0   1.0   1.0   1.0     1       1.0
3    D   1.0   1.0   0.0   1.0   NaN   1.0     4       2.0
100 loops, best of 3: 4.4 ms per loop
  name  day1  day2  day3  day4  day5  day6  day7  expected
0    A   1.0   1.0   NaN   2.0   3.0   0.0     3       2.0
1    B   NaN   NaN   NaN   NaN   NaN   NaN     3       NaN
2    C   1.0   1.0   0.0   1.0   1.0   1.0     1       1.0
3    D   1.0   1.0   0.0   1.0   NaN   1.0     4       2.0
100 loops, best of 3: 3.11 ms per loop
10 loops, best of 3: 78.1 ms per loop
10 loops, best of 3: 163 ms per loop
10 loops, best of 3: 137 ms per loop
1 loop, best of 3: 199 ms per loop
10 loops, best of 3: 130 ms per loop
カテゴリー: 未分類 パーマリンク

コメントを残す

以下に詳細を記入するか、アイコンをクリックしてログインしてください。

WordPress.com ロゴ

WordPress.com アカウントを使ってコメントしています。 ログアウト /  変更 )

Google フォト

Google アカウントを使ってコメントしています。 ログアウト /  変更 )

Twitter 画像

Twitter アカウントを使ってコメントしています。 ログアウト /  変更 )

Facebook の写真

Facebook アカウントを使ってコメントしています。 ログアウト /  変更 )

%s と連携中

このサイトはスパムを低減するために Akismet を使っています。コメントデータの処理方法の詳細はこちらをご覧ください