importing files and libraries
import pandas as pd
EDC = 'EDC_TC.xls.xlsx'
CCC = 'CCC_TC.xlsx'
edc_df = pd.read_excel(EDC)
ccc_df = pd.read_excel(CCC)
print(edc_df.head(10))
print(ccc_df.head(10))
Site Subject Visit Form Seq No Complaint Sample
Code \
0 123 12345 Complaint Complaint [1] 1
Drug/Placebo
1 120 12047 Complaint Complaint [1] 1
Drug/Placebo
2 120 12047 Complaint Complaint [1] 3
Drug/Placebo
3 128 12871 Complaint Complaint [1] 1
Drug/Placebo
4 110 11051 Complaint Complaint [1] 1
Drug/Placebo
5 110 11051 Complaint Complaint [1] 2
Drug/Placebo
6 110 11051 Complaint Complaint [1] 3
Drug/Placebo
Batch No Batch ID Dispense Unit Number ID Dispense Unit Number ID2
\
0 ABC/123 NaN 987654 DUN_ID_2
1 ABC/123 NaN 987670 DUN_ID_2
2 ABC/123 NaN 987677 DUN_ID_2
3 ABC/123 NaN 981655 DUN_ID_2
4 ABC/123 NaN 978234 DUN_ID_2
5 ABC/123 NaN 978235 DUN_ID_2
6 ABC/123 NaN 978236 DUN_ID_2
Onset date Onset date 1 AE related
0 2024-01-01 01jan24 No
1 2024-01-01 01jan24 YES
2 2024-01-10 01jan24 YES
3 2024-07-15 07jan24 No
4 2024-01-01 01jan24 No
5 2024-02-01 01feb24 No
6 2024-03-01 01mar24 No
Trial/Study Type Trial/Study Number Center/Site Subject/Patient ID
\
0 Clinical Trial AB1234 123 12345
1 Clinical Trial AB1234 120 12047
2 Clinical Trial AB1234 120 12047
3 Clinical Trial AB1234 110 11054
4 Clinical Trial AB1234 110 11051
5 Clinical Trial AB1234 110 11051
6 Clinical Trial AB1234 110 11051
7 Clinical Trial AB1234 119 11958
8 Clinical Trial AB1234 119 11958
Technical Complaint No. Complaint No. AE related DUN Number \
0 1 CC-12345 No 987654
1 1 CC-12041 No 987670
2 2 CC-12042 No 987677
3 1 CC-12043 No 978233
4 1 CC-12145 No 978234
5 2 CC-12565 No 978235
6 3 CC-21345 No 978236
7 2 CC-14567 Yes 985648
8 1 CC-14568 No 985649
TC led to an SAE
0 No
1 No
2 YES
3 No
4 No
5 No
6 No
7 Yes
8 No
edc_df = pd.read_excel(EDC,usecols=['Subject', 'Seq No', 'Dispense
Unit Number ID', 'AE related'])
ccc_df = pd.read_excel(CCC,usecols=['Subject/Patient ID', 'Technical
Complaint No.', 'DUN Number', 'AE related'])
print(edc_df.head(10))
print(ccc_df.head(10))
print(edc_df.shape)
print(ccc_df.shape)
Subject Seq No Dispense Unit Number ID AE related
0 12345 1 987654 No
1 12047 1 987670 YES
2 12047 3 987677 YES
3 12871 1 981655 No
4 11051 1 978234 No
5 11051 2 978235 No
6 11051 3 978236 No
Subject/Patient ID Technical Complaint No. AE related DUN Number
0 12345 1 No 987654
1 12047 1 No 987670
2 12047 2 No 987677
3 11054 1 No 978233
4 11051 1 No 978234
5 11051 2 No 978235
6 11051 3 No 978236
7 11958 2 Yes 985648
8 11958 1 No 985649
(7, 4)
(9, 4)
start the indexing from 1
edc_df.index += 1
ccc_df.index += 1
print(edc_df)
print(ccc_df)
Subject Seq No Dispense Unit Number ID AE related
1 12345 1 987654 No
2 12047 1 987670 YES
3 12047 3 987677 YES
4 12871 1 981655 No
5 11051 1 978234 No
6 11051 2 978235 No
7 11051 3 978236 No
Subject/Patient ID Technical Complaint No. AE related DUN Number
1 12345 1 No 987654
2 12047 1 No 987670
3 12047 2 No 987677
4 11054 1 No 978233
5 11051 1 No 978234
6 11051 2 No 978235
7 11051 3 No 978236
8 11958 2 Yes 985648
9 11958 1 No 985649
Problem No. 1:
Check whether all the records match each other
if not discrepancies are present
def check_records_match(edc, ccc):
discrepancies = []
for i, edc_row in edc.iterrows():
matched = False
for j, ccc_row in ccc.iterrows():
if edc_row['Subject'] == ccc_row['Subject/Patient ID'] and
\
edc_row['Dispense Unit Number ID'] == ccc_row['DUN
Number']:
matched = True
if edc_row['AE related'].lower() != ccc_row['AE
related'].lower():
discrepancies.append((i, 'AE related', edc_row['AE
related'], ccc_row['AE related']))
if not matched:
discrepancies.append((i, 'Record not found', 'EDC record
not found in CCC'))
for j, ccc_row in ccc.iterrows():
matched = False
for i, edc_row in edc.iterrows():
if edc_row['Subject'] == ccc_row['Subject/Patient ID'] and
\
edc_row['Dispense Unit Number ID'] == ccc_row['DUN
Number']:
matched = True
if not matched:
discrepancies.append((j, 'Record not found', 'CCC record
not found in EDC'))
return discrepancies
#execccution
discrepancies = check_records_match(edc_df, ccc_df)
print("Discrepancies:")
for discrepancy in discrepancies:
print(discrepancy)
Discrepancies:
(2, 'AE related', 'YES', 'No')
(3, 'AE related', 'YES', 'No')
(4, 'Record not found', 'EDC record not found in CCC')
(4, 'Record not found', 'CCC record not found in EDC')
(8, 'Record not found', 'CCC record not found in EDC')
(9, 'Record not found', 'CCC record not found in EDC')
Problem No. 2:
Check the total number of records in the EDC
File are equal to the CCC File.
def check_record_counts(edc, ccc):
len_edc = len(edc)
len_ccc = len(ccc)
if len_edc != len_ccc:
return "Not Equal\n" f"Number of records in EDC: {len_edc},
Number of records in CCC: {len_ccc}"
return "The number of records in both tables are equal."
# execution
record_count_check = check_record_counts(edc_df, ccc_df)
print("\nRecord Count Check:")
print(record_count_check)
Record Count Check:
Not Equal
Number of records in EDC: 7, Number of records in CCC: 9
Problem No. 3:
Check if any record is missing from both the
tables where one record is present in EDC file
but not in CCC file and vice versa.
def check_missing_records(edc, ccc):
# Create dictionaries to map (Subject, Dispense Unit Number ID)
and (Subject/Patient ID, DUN Number) to their row indices
edc_index = {(row['Subject'], row['Dispense Unit Number ID']): idx
for idx, row in edc.iterrows()}
ccc_index = {(row['Subject/Patient ID'], row['DUN Number']): idx
for idx, row in ccc.iterrows()}
# Convert the keys of these dictionaries to sets for comparison
edc_ids = set(edc_index.keys())
ccc_ids = set(ccc_index.keys())
# Calculate missing records
missing_from_ccc = edc_ids - ccc_ids
missing_from_edc = ccc_ids - edc_ids
# Prepare results with row numbers
missing_from_ccc_with_indices = [(record, edc_index[record]) for
record in missing_from_ccc]
missing_from_edc_with_indices = [(record, ccc_index[record]) for
record in missing_from_edc]
return missing_from_edc_with_indices,
missing_from_ccc_with_indices
# Execution
missing_from_edc, missing_from_ccc = check_missing_records(edc_df,
ccc_df)
print("\nMissing Records from EDC (Row Number in EDC):")
for missing in missing_from_edc:
record, row_num = missing
print(f"Record: {record}, Row Number: {row_num}")
print("\nMissing Records from CCC (Row Number in CCC):")
for missing in missing_from_ccc:
record, row_num = missing
print(f"Record: {record}, Row Number: {row_num}")
Missing Records from EDC (Row Number in EDC):
Record: (11054, 978233), Row Number: 4
Record: (11958, 985648), Row Number: 8
Record: (11958, 985649), Row Number: 9
Missing Records from CCC (Row Number in CCC):
Record: (12871, 981655), Row Number: 4