PPG Pre-processing and SQI Calculations using Pandas¶

Trimming, Filtering, Segmentation and SQI Calculations utilising Pandas

 # Code Adapted from:
 # https://meta00.github.io/vital_sqi/_examples/others/plot_pipeline_02.html
 # AND
 # https://meta00.github.io/vital_sqi/_examples/others/plot_read_signal.html#sphx-glr-examples-others-plot-read-signal-py

 '''
 This file will be used in calculating the SQIs for each patient (and discarding parts of the signal soon).
 A loop will be implemented in future iterations for the different patiend files. In general, the file is built
 parametrically so that the procedure can be automated.

 '''

Out:

'\nThis file will be used in calculating the SQIs for each patient (and discarding parts of the signal soon).\nA loop will be implemented in future iterations for the different patiend files. In general, the file is built\nparametrically so that the procedure can be automated.\n\n'

Importing Libraries

 # Generic
 import os
 import numpy as np
 import pandas as pd
 import matplotlib.pyplot as plt
 from scipy import signal
 from datetime import datetime

 # Scipy
 from scipy.stats import skew, kurtosis, entropy

  # vitalSQI
 from vital_sqi.data.signal_io import PPG_reader
 import vital_sqi.highlevel_functions.highlevel as sqi_hl
 import vital_sqi.data.segment_split as sqi_sg
 from vital_sqi.common.rpeak_detection import PeakDetector
 from vital_sqi.preprocess.band_filter import BandpassFilter
 import vital_sqi.sqi as sq

Loading data

 # Filepath definition
 filepath = r'..\..\..\..\OUCRU\01NVa_Dengue\Adults\01NVa-003-2001\PPG'
 filename = r'01NVa-003-2001 Smartcare.csv'
 filename_Clinical = r'..\..\..\..\OUCRU\Clinical\v0.0.10\01nva_data_stacked_corrected.csv'



 #defining constants
 trim_amount = 300
 hp_filt_params = (1, 1) #(Hz, order)
 lp_filt_params = (20, 4) #(Hz, order)
 filter_type =  'butter'
 segment_length = 30
 sampling_rate = 100

 #reading Clinical Data
 Clinical = pd.read_csv(filename_Clinical)

 #readind PPG data

 data = pd.read_csv(os.path.join(filepath, filename))

 # data = PPG_reader(os.path.join(filepath, filename),
 #     signal_idx=["PLETH", "IR_ADC"],
 #     timestamp_idx=["TIMESTAMP_MS"],
 #     info_idx=["SPO2_PCT","PULSE_BPM","PERFUSION_INDEX"], sampling_rate=sampling_rate)



 print(data)
 print(data.PLETH)
 print(data.IR_ADC)
 print(data.TIMESTAMP_MS)
 print(data.SPO2_PCT)

 signals = data[["PLETH", "IR_ADC"]]

 plot_range = np.arange(0,1000,1)
 fig, ax = plt.subplots()
 ax.plot(plot_range, signals.iloc[0:1000, 0])

Out:

         TIMESTAMP_MS  COUNTER  DEVICE_ID  PULSE_BPM  SPO2_PCT  SPO2_STATUS  PLETH  BATTERY_PCT  RED_ADC  IR_ADC  PERFUSION_INDEX
0                   0        9          1         82      97.1            0  36033        100.0   144180  216524              4.4
1                  10       10          1         82      97.1            0  40919        100.0   144160  216470              4.4
2                  20       11          1         82      97.1            0  45715        100.0   144158  216415              4.4
3                  30       12          1         82      97.1            0  50227        100.0   144148  216364              4.4
4                  40       13          1         82      97.1            0  54362        100.0   144132  216379              4.4
...               ...      ...        ...        ...       ...          ...    ...          ...      ...     ...              ...
5626372      56269900  5626999          1         78      91.4            0  31408          5.7   292213  183755              5.6
5626373      56269910  5627000          1         78      91.4            0  31318          5.7   292557  184002              5.6
5626374      56269920  5627001          1         78      91.4            0  31117          5.7   292840  184296              5.6
5626375      56269930  5627002          1         78      91.4            0  30761          5.7   293161  184554              5.6
5626376      56269940  5627003          1         78      91.4            0  30261          5.7   293522  184828              5.6

[5626377 rows x 11 columns]
0          36033
1          40919
2          45715
3          50227
4          54362
           ...
5626372    31408
5626373    31318
5626374    31117
5626375    30761
5626376    30261
Name: PLETH, Length: 5626377, dtype: int64
0          216524
1          216470
2          216415
3          216364
4          216379
            ...
5626372    183755
5626373    184002
5626374    184296
5626375    184554
5626376    184828
Name: IR_ADC, Length: 5626377, dtype: int64
0                 0
1                10
2                20
3                30
4                40
             ...
5626372    56269900
5626373    56269910
5626374    56269920
5626375    56269930
5626376    56269940
Name: TIMESTAMP_MS, Length: 5626377, dtype: int64
0          97.1
1          97.1
2          97.1
3          97.1
4          97.1
           ...
5626372    91.4
5626373    91.4
5626374    91.4
5626375    91.4
5626376    91.4
Name: SPO2_PCT, Length: 5626377, dtype: float64

[<matplotlib.lines.Line2D object at 0x000001C5BCEA4B50>]

fetching ppg_start time

 def find_event_ppg_time(Clinical,Patient):
     row = Clinical[(Clinical.study_no == Patient) & (Clinical.column == 'event_ppg') & (Clinical.result == 'True')]
     return row['date'].values

Setting the indexes for the raw data - adopted from examples

 pd.Timedelta.__str__ = lambda x: x._repr_base('all')

 #Transposing data.signals to set into columns
 #signals = pd.DataFrame(data.signals.T)

 # Include column with index
 signals = signals.reset_index()

 signals['timedelta'] = pd.to_timedelta(data.TIMESTAMP_MS, unit='ms')

 PPG_start_date = find_event_ppg_time(Clinical, filename.split()[0][-8:])

 PPG_start_date = datetime.strptime(PPG_start_date[0], '%Y-%m-%d %H:%M:%S')

 signals['PPG_Datetime'] = pd.to_datetime(PPG_start_date)
 signals['PPG_Datetime']+= pd.to_timedelta(signals.timedelta)

 # Set the timedelta index (keep numeric index too)
 signals = signals.set_index('timedelta')

 # Rename column to avoid confusion
 signals = signals.rename(columns={'index': 'idx'})

 print("Raw Signals:")
 print(signals)

 #Plotting
 fig, axes = plt.subplots(nrows=2, ncols=1)
 axes = axes.flatten()
 signals.iloc[:,1].plot(ax=axes[0])
 signals.iloc[:,2].plot(ax=axes[1])

Out:

Raw Signals:
                            idx  PLETH  IR_ADC            PPG_Datetime
timedelta
0 days 00:00:00               0  36033  216524 2020-06-04 09:05:00.000
0 days 00:00:00.010000        1  40919  216470 2020-06-04 09:05:00.010
0 days 00:00:00.020000        2  45715  216415 2020-06-04 09:05:00.020
0 days 00:00:00.030000        3  50227  216364 2020-06-04 09:05:00.030
0 days 00:00:00.040000        4  54362  216379 2020-06-04 09:05:00.040
...                         ...    ...     ...                     ...
0 days 15:37:49.900000  5626372  31408  183755 2020-06-05 00:42:49.900
0 days 15:37:49.910000  5626373  31318  184002 2020-06-05 00:42:49.910
0 days 15:37:49.920000  5626374  31117  184296 2020-06-05 00:42:49.920
0 days 15:37:49.930000  5626375  30761  184554 2020-06-05 00:42:49.930
0 days 15:37:49.940000  5626376  30261  184828 2020-06-05 00:42:49.940

[5626377 rows x 4 columns]

<AxesSubplot:xlabel='timedelta'>

Trimming the first and last 5 minutes

 #Defining the 5 minute offset
 offset = pd.Timedelta(minutes=5)

 #Indexes - adopted from example
 idxs = (signals.index > offset) & \
         (signals.index < signals.index[-1] - offset)

 #Trimming
 signals = signals[idxs]

 print(signals)

Out:

                            idx  PLETH  IR_ADC            PPG_Datetime
timedelta
days 00:05:00.010000    29913  21755  188656 2020-06-04 09:10:00.010
days 00:05:00.020000    29914  23262  187701 2020-06-04 09:10:00.020
days 00:05:00.030000    29915  25285  186998 2020-06-04 09:10:00.030
days 00:05:00.040000    29916  27839  186472 2020-06-04 09:10:00.040
days 00:05:00.050000    29917  30888  186113 2020-06-04 09:10:00.050
...                         ...    ...     ...                     ...
days 15:32:49.890000  5596371  15083  186186 2020-06-05 00:37:49.890
days 15:32:49.900000  5596372  13923  186422 2020-06-05 00:37:49.900
days 15:32:49.910000  5596373  12818  186662 2020-06-05 00:37:49.910
days 15:32:49.920000  5596374  11769  186807 2020-06-05 00:37:49.920
days 15:32:49.930000  5596375  10763  186899 2020-06-05 00:37:49.930

[5566463 rows x 4 columns]

 # RESAMPLE??
 # MISSING DATA INPUT??
 # TAPPERING??

Band-pass filtering

 '''
 Band-pass filtering can be adjusted, maybe use a lower low-pass
 cutoff frequency @15-17Hz

 '''
 #Band pass filtering function
 def bpf(signal):
     hp_filt_params = (1, 1) #(Cutoff Hz, order)
     lp_filt_params = (20, 4) #( Cutoff Hz, order)
     sampling_rate = 100
     filter = BandpassFilter(band_type='butter', fs=sampling_rate)
     filtered_signal = filter.signal_highpass_filter(signal, cutoff=hp_filt_params[0], order=hp_filt_params[1])
     filtered_signal = filter.signal_lowpass_filter(filtered_signal, cutoff=lp_filt_params[0], order=lp_filt_params[1])
     return filtered_signal

 #Band-pass filtered signals
 signals['PLETH_bpf'] = bpf(signals.iloc[:,1])
 signals['IR_ADC_bpf'] = bpf(signals.iloc[:,2])

 print('Raw and Filtered Signals:')
 print(signals)

 fig, axes = plt.subplots(nrows=2, ncols=1)
 axes = axes.flatten()
 signals['PLETH_bpf'].plot(ax=axes[0])
 signals['IR_ADC_bpf'].plot(ax=axes[1])

Out:

Raw and Filtered Signals:
                            idx  PLETH  IR_ADC            PPG_Datetime    PLETH_bpf   IR_ADC_bpf
timedelta
0 days 00:05:00.010000    29913  21755  188656 2020-06-04 09:10:00.010  -214.886697   -32.494545
0 days 00:05:00.020000    29914  23262  187701 2020-06-04 09:10:00.020 -1224.999347  -226.831554
0 days 00:05:00.030000    29915  25285  186998 2020-06-04 09:10:00.030 -3047.445501  -734.950280
0 days 00:05:00.040000    29916  27839  186472 2020-06-04 09:10:00.040 -4351.721824 -1499.309485
0 days 00:05:00.050000    29917  30888  186113 2020-06-04 09:10:00.050 -3863.561589 -2230.015638
...                         ...    ...     ...                     ...          ...          ...
0 days 15:32:49.890000  5596371  15083  186186 2020-06-05 00:37:49.890   942.030869    45.325519
0 days 15:32:49.900000  5596372  13923  186422 2020-06-05 00:37:49.900   708.195771    93.296871
0 days 15:32:49.910000  5596373  12818  186662 2020-06-05 00:37:49.910   542.754581   127.077955
0 days 15:32:49.920000  5596374  11769  186807 2020-06-05 00:37:49.920   442.131641   151.598399
0 days 15:32:49.930000  5596375  10763  186899 2020-06-05 00:37:49.930   402.889073   159.679506

[5566463 rows x 6 columns]

<AxesSubplot:xlabel='timedelta'>

SQI Functions Definition and Peak Detection

 def PeakDetection(x):
     detector = PeakDetector()
     peaks, troughs = detector.ppg_detector(x, 7) #6 and 7 work the best
     return peaks, troughs

 #Defining SQI Functions

 def snr(x):
     #Signal to noise ratio
     #needs raw signal
     return np.mean(sq.standard_sqi.signal_to_noise_sqi(x))

 def zcr(x):
     #Zero Crossing Rate
     #Needs filtered signal
     return 0.5 * np.mean(np.abs(np.diff(np.sign(x))))

 def mcr(x):
     #Mean Crossing Rate
     #needs raw signal
     return zcr(x - np.mean(x))

 def perfusion(x,y):
     return (np.max(y) - np.min(y)) / np.abs(np.mean(x)) * 100

 def correlogram(x):
     #Correlogram
     #Needs filtered signal
     return sq.rpeaks_sqi.correlogram_sqi(x)

 #Calculating peaks and troughs using peakdetector function for both filtered signals
 peak_list_0, trough_list_0 = PeakDetection(signals['PLETH_bpf'])
 peak_list_1, trough_list_1 = PeakDetection(signals['IR_ADC_bpf'])

 '''
 MSW values that are being calculated are too small, maybe peakdetectors need
 to be revisited?
 '''
 def msq(x,peaks):
     #Dynamic Time Warping
     #Requires Filtered Signal and peaks
      # Return
      return sq.standard_sqi.msq_sqi(x, peaks_1=peaks, peak_detect2=6)

 '''
 Below function is outputting an error, needs revisiting
 '''
 # def dtw(x,troughs):
 #     #Dynamic Time Warping
 #     #Requires Filtered Signal and troughs and template selection 0-3 (0-2 for PPG)
 #     # Per beat
 #     dtw_list = sq.standard_sqi.per_beat_sqi(sqi_func=sq.dtw_sqi, troughs=troughs, signal=x, taper=True, template_type=1)
 #     # Return mean and standard deviation
 #     return [np.mean(dtw_list), np.std(dtw_list)]

Out:

'\nBelow function is outputting an error, needs revisiting\n'

Function Computing SQIs

 def sqi_all(x,raw,filtered,peaks,troughs):

     #Computing  all SQIs using the functions above

     # Information
     dinfo = {
         'PPG_w_s': x.PPG_Datetime.iloc[0],
         'PPG_w_f': x.PPG_Datetime.iloc[-1],
         'first': x.idx.iloc[0],
         'last': x.idx.iloc[-1],
         'skew': skew(x[raw]),
         'entropy': entropy(x[raw]),
         'kurtosis': kurtosis(x[raw]),
         'snr': snr(x[raw]),
         'mcr': mcr(x[raw]),
         'zcr': zcr(x[filtered]),
         'msq': msq(x[filtered],peaks),
         'perfusion': perfusion(x[raw], x[filtered]),
         'correlogram': correlogram(x[filtered]),
         #'dtw': dtw(x[filtered],troughs)
     }
     '''
     When using dtw an empty signal error is presented that
     needs fixing, not sure why it does that
     '''
     # Return
     return pd.Series(dinfo)

Calculating SQIs for both signals 0 = Pleth and 1 = IR_ADC

 sqis_0 = signals.groupby(pd.Grouper(freq='30s')).apply(lambda x: sqi_all(x, 'PLETH','PLETH_bpf', peak_list_0, trough_list_0))
 sqis_1 = signals.groupby(pd.Grouper(freq='30s')).apply(lambda x: sqi_all(x, 'IR_ADC','IR_ADC_bpf', peak_list_1, trough_list_1))

 #Displaying the SQIs per signal
 print("SQIs Signal 0 (Pleth):")
 sqis_0
 print(sqis_0)

 print("SQIs Signal 1 (IR_ADC):")
 sqis_1
 print(sqis_1)
 sqis_1 = sqis_1.drop(['first','last', 'PPG_w_s', 'PPG_w_f'], axis = 1)


 # Add window id to identify point of shock more easily
 sqis_1['w'] = np.arange(sqis_1.shape[0])

Out:

d:\files\desktop\dissertation icl\env\lib\site-packages\statsmodels\tsa\stattools.py:667: FutureWarning: fft=True will become the default after the release of the 0.12 release of statsmodels. To suppress this warning, explicitly set fft=False.
  warnings.warn(
SQIs Signal 0 (Pleth):
                                       PPG_w_s                 PPG_w_f    first  ...       msq   perfusion                                        correlogram
timedelta                                                                        ...
0 days 00:05:00.010000 2020-06-04 09:10:00.010 2020-06-04 09:10:30.000    29913  ...  0.000438  131.768567  [75, 150, 225, 0.9247323846304137, 0.836659565...
0 days 00:05:30.010000 2020-06-04 09:10:30.010 2020-06-04 09:11:00.000    32913  ...  0.000037  129.721286  [74, 148, 222, 0.9374430072923502, 0.866928327...
0 days 00:06:00.010000 2020-06-04 09:11:00.010 2020-06-04 09:11:30.000    35913  ...  0.000000  135.690506  [75, 150, 225, 0.9244827555340438, 0.814277364...
0 days 00:06:30.010000 2020-06-04 09:11:30.010 2020-06-04 09:12:00.000    38913  ...  0.000007  136.156223  [75, 150, 225, 0.9386014677175014, 0.862369440...
0 days 00:07:00.010000 2020-06-04 09:12:00.010 2020-06-04 09:12:30.000    41913  ...  0.000000  139.046179  [74, 149, 224, 0.9313743004418997, 0.824739899...
...                                        ...                     ...      ...  ...       ...         ...                                                ...
0 days 15:30:30.010000 2020-06-05 00:35:30.010 2020-06-05 00:36:00.000  5582383  ...  0.000000  148.508917  [37, 74, 110, -0.3912006306694429, 0.952924647...
0 days 15:31:00.010000 2020-06-05 00:36:00.010 2020-06-05 00:36:30.000  5585383  ...  0.000022  138.961305  [37, 75, 112, -0.38664616039771205, 0.95837555...
0 days 15:31:30.010000 2020-06-05 00:36:30.010 2020-06-05 00:37:00.000  5588383  ...  0.000015  125.616461  [77, 153, 230, 0.9462808953640326, 0.887181765...
0 days 15:32:00.010000 2020-06-05 00:37:00.010 2020-06-05 00:37:30.000  5591383  ...  0.000015  135.269077  [36, 74, 110, -0.37596691904957935, 0.93875374...
0 days 15:32:30.010000 2020-06-05 00:37:30.010 2020-06-05 00:37:49.930  5594383  ...  0.000022  134.870996  [37, 75, 112, -0.36633265827700123, 0.94094953...

[1856 rows x 13 columns]
SQIs Signal 1 (IR_ADC):
                                       PPG_w_s                 PPG_w_f    first  ...       msq  perfusion                                        correlogram
timedelta                                                                        ...
0 days 00:05:00.010000 2020-06-04 09:10:00.010 2020-06-04 09:10:30.000    29913  ...  0.000319   6.653209  [75, 150, 226, 0.9132608947040779, 0.799647163...
0 days 00:05:30.010000 2020-06-04 09:10:30.010 2020-06-04 09:11:00.000    32913  ...  0.000013   5.641682  [74, 148, 222, 0.9164960001123641, 0.827303342...
0 days 00:06:00.010000 2020-06-04 09:11:00.010 2020-06-04 09:11:30.000    35913  ...  0.000008   6.088179  [75, 150, 225, 0.9097936306566979, 0.779681593...
0 days 00:06:30.010000 2020-06-04 09:11:30.010 2020-06-04 09:12:00.000    38913  ...  0.000013   4.644686  [75, 149, 224, 0.9128541336509863, 0.807913379...
0 days 00:07:00.010000 2020-06-04 09:12:00.010 2020-06-04 09:12:30.000    41913  ...  0.000000   4.241945  [74, 149, 223, 0.9123059092032415, 0.791340070...
...                                        ...                     ...      ...  ...       ...        ...                                                ...
0 days 15:30:30.010000 2020-06-05 00:35:30.010 2020-06-05 00:36:00.000  5582383  ...  0.000013   4.946963  [34, 74, 107, -0.38922458037787383, 0.95155578...
0 days 15:31:00.010000 2020-06-05 00:36:00.010 2020-06-05 00:36:30.000  5585383  ...  0.000000   4.268080  [75, 150, 226, 0.964216933015672, 0.9155762504...
0 days 15:31:30.010000 2020-06-05 00:36:30.010 2020-06-05 00:37:00.000  5588383  ...  0.000013   4.733346  [77, 153, 230, 0.9423859574341878, 0.877431048...
0 days 15:32:00.010000 2020-06-05 00:37:00.010 2020-06-05 00:37:30.000  5591383  ...  0.000008   4.645047  [74, 103, 148, 0.9269084193473138, -0.35288054...
0 days 15:32:30.010000 2020-06-05 00:37:30.010 2020-06-05 00:37:49.930  5594383  ...  0.000008   4.352384  [75, 150, 226, 0.9326593121930683, 0.859995140...

[1856 rows x 13 columns]

Final Formatting into a single DataFrame and Saving into a .csv file

 #Merging SQIs of different signals from the same record (Pleth and IR_ADC)
 Signal_SQIs = sqis_0.merge(sqis_1, on='timedelta', suffixes=('_0', '_1'))

 #Automatically fetching the study_no from the filename and including it in the DataFrame
 Signal_SQIs['study_no'] = filename.split()[0][-8:]
 print(Signal_SQIs)

 # Saving the SQIs to a CSV file
 Signal_SQIs.to_csv(r'..\..\..\..\OUCRU\Outputs\SQI.csv')

Out:

                                       PPG_w_s                 PPG_w_f    first  ...                                      correlogram_1     w  study_no
timedelta                                                                        ...
days 00:05:00.010000 2020-06-04 09:10:00.010 2020-06-04 09:10:30.000    29913  ...  [75, 150, 226, 0.9132608947040779, 0.799647163...     0  003-2001
days 00:05:30.010000 2020-06-04 09:10:30.010 2020-06-04 09:11:00.000    32913  ...  [74, 148, 222, 0.9164960001123641, 0.827303342...     1  003-2001
days 00:06:00.010000 2020-06-04 09:11:00.010 2020-06-04 09:11:30.000    35913  ...  [75, 150, 225, 0.9097936306566979, 0.779681593...     2  003-2001
days 00:06:30.010000 2020-06-04 09:11:30.010 2020-06-04 09:12:00.000    38913  ...  [75, 149, 224, 0.9128541336509863, 0.807913379...     3  003-2001
days 00:07:00.010000 2020-06-04 09:12:00.010 2020-06-04 09:12:30.000    41913  ...  [74, 149, 223, 0.9123059092032415, 0.791340070...     4  003-2001
...                                        ...                     ...      ...  ...                                                ...   ...       ...
days 15:30:30.010000 2020-06-05 00:35:30.010 2020-06-05 00:36:00.000  5582383  ...  [34, 74, 107, -0.38922458037787383, 0.95155578...  1851  003-2001
days 15:31:00.010000 2020-06-05 00:36:00.010 2020-06-05 00:36:30.000  5585383  ...  [75, 150, 226, 0.964216933015672, 0.9155762504...  1852  003-2001
days 15:31:30.010000 2020-06-05 00:36:30.010 2020-06-05 00:37:00.000  5588383  ...  [77, 153, 230, 0.9423859574341878, 0.877431048...  1853  003-2001
days 15:32:00.010000 2020-06-05 00:37:00.010 2020-06-05 00:37:30.000  5591383  ...  [74, 103, 148, 0.9269084193473138, -0.35288054...  1854  003-2001
days 15:32:30.010000 2020-06-05 00:37:30.010 2020-06-05 00:37:49.930  5594383  ...  [75, 150, 226, 0.9326593121930683, 0.859995140...  1855  003-2001

[1856 rows x 24 columns]

 '''
 #Running Some Abstract Checks to Verify Results etc.

 print(perfusion(signals[0][0:3000], signals['PLETH_bpf'][0:3000]))

 '''

Out:

"\n#Running Some Abstract Checks to Verify Results etc.\n\nprint(perfusion(signals[0][0:3000], signals['PLETH_bpf'][0:3000]))\n\n"

 '''
 Discarding Process to be implemented after quality check.
 '''

Out:

'\nDiscarding Process to be implemented after quality check.\n'

Total running time of the script: ( 1 minutes 59.605 seconds)

Gallery generated by Sphinx-Gallery