'extract longest non NaN sequence from array

I want to extract the longest sequence of consecutive non NaN values from an array in Python. So from this one:

a = [NaN, NaN, NaN, 1, 4, NaN, NaN, NaN, 4, 6, 8, 4, 6, 6, 4, 3, 2, NaN, NaN, NaN, 2, NaN, NaN, NaN]

I would like to get

a_nonNaN_long = [4, 6, 8, 4, 6, 6, 4, 3, 2]

So the way I was thinking to go about this is to get the first non NaN value using this function

def firstNonNan(listfloats):
    i = 0
    for item in listfloats:
        i += 1
        if math.isnan(item) == False:
            return i

And then use the index from this function in a while loop to get subsection of the array until I find the longest consecutive sequence of Non nan values. I wonder if anybody has some other/better way to do it?



Solution 1:[1]

You can use itertools.groupby to get non-NaN stretches, then max for the longest:

from itertools import groupby
import math

out = max((list(g) for k,g in groupby(a, math.isnan) if not k), key=len)

Output:

[4, 6, 8, 4, 6, 6, 4, 3, 2]

Used input:

NaN = float('nan')
a = [NaN, NaN, NaN, 1, 4, NaN, NaN, NaN, 4, 6, 8, 4, 6, 6, 4, 3, 2, NaN, NaN, NaN, 2, NaN, NaN, NaN]

Solution 2:[2]

To see whether numpy based vectorized approaches or pure python algorithms to find the bounds of non-NaN sequences could be performance competitive with a python groupby answer such as the one by @mozway, I used perfplot to benchmark four strategies: pure python with groupby, pure python without groupby, numpy and numba.

In one run, I used a python list as input (as in the question), which puts the onus on the numpy and numba runs to do the round-trip conversion from list to np.array and back, as well as to deal with heterogeneous data types given that NaN is float and the values are int.

Here's how it looks:

enter image description here

In the second run, I used a python list as input for the non-numpy solutions, and an np.array as input (with NaN replaced by an integer sentinel outside the range of actual values, to allow a dtype of int32) and output, to see whether recasting the problem in numpy-friendly array types would help the numpy/numba solutions.

Here are the results:

enter image description here

Conclusion: The numpy and numba solutions are about 1.5 orders of magnitude faster if they are allowed to use homogeneous numpy input and output. Otherwise (if they must incur the roundtrip overhead of list-to-homogenous-numpy conversion and back) they are more or less on top of the pure python solutions.

Pure python groupby beats non-groupby by a bit, and in the second case, numba beats numpy by a little.

Note that none of the others beats the groubpy solution by @mozway for simplicity of expression.

Here is the benchmark code:

NaN = float("nan")
a = [NaN, NaN, NaN, 1, 4, NaN, NaN, NaN, 4, 6, 8, 4, 6, 6, 4, 3, 2, NaN, NaN, NaN, 2, NaN, NaN, NaN]

import numpy as np
aNp = np.array([0 if v is NaN else v for v in a], np.int32)

def foo_1(a):
    mnMinus1 = min(v for v in a if not v is NaN) - 1
    np_a = np.array([mnMinus1 if v is NaN else v for v in a], np.int32)
    return list(np_foo_1(np_a, mnMinus1))
def foo_2(a):
    mnMinus1 = min(v for v in a if not v is NaN) - 1
    np_a = np.array([mnMinus1 if v is NaN else v for v in a], np.int32)
    return list(np_foo_2(np_a, mnMinus1))
def np_foo_1(b, mnMinus1):
    R = np.concatenate((np.array([mnMinus1]), b[:-1]))
    bIsnan = b==mnMinus1
    RIsnan = R==mnMinus1
    nonNanL = (~bIsnan) & RIsnan
    nonNanR = bIsnan & ~RIsnan
    index = np.arange(len(b))
    left, right = index[nonNanL], index[nonNanR]
    lens = right - left
    i = lens.argmax()
    result = b[left[i]:right[i]]
    return result

from numba import njit
@njit
def np_foo_2(b, mnMinus1):
    R = np.concatenate((np.array([mnMinus1]), b[:-1]))
    bIsnan = b==mnMinus1
    RIsnan = R==mnMinus1
    nonNanL = (~bIsnan) & RIsnan
    nonNanR = bIsnan & ~RIsnan
    index = np.arange(len(b))
    left, right = index[nonNanL], index[nonNanR]
    lens = right - left
    i = lens.argmax()
    result = b[left[i]:right[i]]
    return result

def foo_3(a):
    LR = []
    left, right = 0, 0
    while left < len(a):
        while right < len(a) and a[right] is not NaN:
            right += 1
        LR.append((left, right))
        left = right + 1
        while left < len(a) and a[left] is NaN:
            left += 1
        right = left + 1
    #i = max(zip(LR, range(len(LR))), key = lambda x: x[0][1] - x[0][0])[-1]
    i, mx = 0, 0
    for j in range(len(LR)):
        cur = LR[j][1] - LR[j][0]
        if cur > mx:
            i, mx = j, cur
    return a[LR[i][0]:LR[i][1]]
    
        
    left = [i for i, v in enumerate(a) if v is not NaN and (not i or a[i-1] is NaN)]
    right = [i for i, v in enumerate(a) if v is NaN and (False if not i else a[i-1] is not NaN)]
    #i = max(zip(left, right, range(len(left))), key = lambda x: x[1] - x[0])[-1]
    i, mx = 0, 0
    for j in range(len(left)):
        cur = right[j] - left[j]
        if cur > mx:
            i, mx = j, cur
    return a[left[i]:right[i]]

from itertools import groupby
import math
def foo_4(a):
    out = max((list(g) for k,g in groupby(a, math.isnan) if not k), key=len)
    return out

def dual2_foo_1(a, np_a):
    return foo_1(a)
def dual2_foo_2(a, np_a):
    return foo_2(a)
def dual2_foo_3(a, np_a):
    return foo_3(a)
def dual2_foo_4(a, np_a):
    return foo_4(a)
    
def dual_foo_1(a, np_a):
    return np_foo_1(np_a, 0)
def dual_foo_2(a, np_a):
    return np_foo_2(np_a, 0)
def dual_foo_3(a, np_a):
    return foo_3(a)
def dual_foo_4(a, np_a):
    return foo_4(a)
    

foo_count = 4
foo_names=['foo_' + str(i + 1) for i in range(foo_count)]
foo_labels=['numpy', 'numpy_numba', 'python_find_seq_bounds', 'python_groupby']
exec("foo_funcs=[" + ','.join(f"foo_{str(i + 1)}" for i in range(foo_count)) + "]")
exec("dual_foo_funcs=[" + ','.join(f"dual_foo_{str(i + 1)}" for i in range(foo_count)) + "]")
exec("dual2_foo_funcs=[" + ','.join(f"dual2_foo_{str(i + 1)}" for i in range(foo_count)) + "]")
for foo in foo_names:
    print(f'{foo} output:')
    print(eval(f"{foo}(a)"))


import perfplot
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
plt.rcParams["figure.autolayout"] = True

perfplot.show(
    setup=lambda n: (a * n, np.array([0 if v is NaN else v for v in a * n], np.int32)),
    kernels=dual_foo_funcs,
    labels=foo_labels,
    n_range=[2 ** k for k in range(11)],
    equality_check=np.allclose,
    xlabel='n/24',
    logx="auto",
    logy="auto"
)

perfplot.show(
    setup=lambda n: (a * n, np.array([0 if v is NaN else v for v in a * n], np.int32)),
    kernels=dual2_foo_funcs,  # switch to dual_foo_funcs for second benchmark
    labels=foo_labels,
    n_range=[2 ** k for k in range(11)],
    equality_check=np.allclose,
    xlabel='n/24',
    logx="auto",
    logy="auto"
)

Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution Source
Solution 1 mozway
Solution 2