import pandas as pd
import re
import warnings
warnings.filterwarnings("ignore")
df1=pd.read_excel(r"D:\SIP_OutlookGroup\Business Analytics module3-
1.xlsx")
df1
        MAG       SUBCODE MASTER_KEY       S DESIGNATION               CITY
STA \
0       OLT         81221         5148   NaN        OWNER           SURAT
GUJ
1        OL        477892         5150   NaN          IPS       KOZHIKODE
KER
2        OL        477990         5159   NaN       OWNDER       HASSAN DT
KAR
3       OLM        188077         5167     S          NaN       BANGALORE
KAR
4       OLH          8117         5168     S   CONSULTANT      NEW MUMBAI
MAH
...     ...           ...          ...   ...          ...               ...   ..
.
16516   OLM        235673     920684     NaN          NaN              AGRA
UP
16517   OLM        163195          NaN     C          NaN   SUNDERGARH DT
ORI
16518    OL        304471          NaN   NaN          NaN       NEW DELHI
DEL
16519   NaN           NaN          NaN   NaN          NaN               NaN
NaN
16520   165   20 rows sel     ected.     NaN          NaN               NaN
NaN
       REGIONCODE    NOOFCOPIES                    sdate     EXPDATE
LASTEXPDA
0             4.0           1.0    2018-08-01 00:00:00 2025-02-01 2025-02-
01
1             6.0           1.0    2022-03-11 00:00:00 2024-08-21 2024-08-
21
2             8.0           1.0    2022-03-14 00:00:00 2024-09-01 2024-09-
01
3             8.0           1.0    2019-03-01 00:00:00 2024-09-01 2024-09-
01
4             4.0           1.0    2022-03-07 00:00:00 2024-06-24 2024-06-
24
...           ...           ...                      ...         ...           .
..
16516         2.0           1.0    2024-11-01 00:00:00 2024-10-01 2024-10-
01
16517        2.0            1.0   2008-08-27 00:00:00 2013-08-07 2025-03-
01
16518        1.0            1.0   2003-03-22 00:00:00 2004-12-13 2024-12-
21
16519        NaN            NaN                   NaN        NaT
NaT
16520        NaN            NaN                   NaN        NaT
NaT
[16521 rows x 12 columns]
df1.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16521 entries, 0 to 16520
Data columns (total 12 columns):
 #   Column       Non-Null Count Dtype
--- ------        -------------- -----
 0   MAG          16520 non-null object
 1   SUBCODE      16520 non-null object
 2   MASTER_KEY   16518 non-null object
 3   S            2712 non-null   object
 4   DESIGNATION 1995 non-null    object
 5   CITY         16517 non-null object
 6   STA          16519 non-null object
 7   REGIONCODE   16519 non-null float64
 8   NOOFCOPIES   16519 non-null float64
 9   sdate        16519 non-null object
 10 EXPDATE       16519 non-null datetime64[ns]
 11 LASTEXPDA     16519 non-null datetime64[ns]
dtypes: datetime64[ns](2), float64(2), object(8)
memory usage: 1.5+ MB
for i in range(0,11):
    c=df1.columns[i]
    print(df1.columns[i],":",df1[c].nunique())
MAG : 6
SUBCODE : 16442
MASTER_KEY : 15273
S : 28
DESIGNATION : 538
CITY : 1255
STA : 35
REGIONCODE : 12
NOOFCOPIES : 9
sdate : 541
EXPDATE : 198
df1['CITY'].unique()
array(['SURAT', 'KOZHIKODE', 'HASSAN DT', ..., 'NADIA', 'JAJPUR',
       'HAZARIBAGH'], dtype=object)
df1['MAG'].unique()
array(['OLT', 'OL', 'OLM', 'OLH', 'OLB', nan, 165], dtype=object)
valcount = df1['MAG'].value_counts(dropna=False)
valcount
OLM    4895
OLT    3898
OL     3345
OLB    3003
OLH    1378
NaN       1
165       1
Name: MAG, dtype: int64
 •   The five known brands (OLM, OLT, OL, OLB, OLH) make up almost all the data.
 •   165 is clearly an error
 •   One missing value is negligible but should be handled.
# Replace 165 with NaN or remove it if it's an error
df1['MAG'] = df1['MAG'].replace(165, pd.NA)
df1 = df1[df1['MAG'].notna()]
df1.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 16519 entries, 0 to 16518
Data columns (total 12 columns):
 #   Column       Non-Null Count Dtype
--- ------        -------------- -----
 0   MAG          16519 non-null object
 1   SUBCODE      16519 non-null object
 2   MASTER_KEY   16517 non-null object
 3   S            2712 non-null   object
 4   DESIGNATION 1995 non-null    object
 5   CITY         16517 non-null object
 6   STA          16519 non-null object
 7   REGIONCODE   16519 non-null float64
 8   NOOFCOPIES   16519 non-null float64
 9   sdate        16519 non-null object
 10 EXPDATE       16519 non-null datetime64[ns]
 11 LASTEXPDA     16519 non-null datetime64[ns]
dtypes: datetime64[ns](2), float64(2), object(8)
memory usage: 1.6+ MB
Dropping rows with null DESIGNATION would cause major data loss,
because only 1,995 non-null DESIGNATION entries out of 16,521 total
rows. THerefore categorizing them as Unknown
# Replace NaN values in the 'DESIGNATION' column with 'Unknown'
df1['DESIGNATION'] = df1['DESIGNATION'].fillna('Unknown')
# Remove direction suffixes (NORTH, SOUTH, EAST, WEST, CENTRAL)
import re
def normalize_city(city):
    if pd.isnull(city):
        return city
    return re.sub(r'\s+(NORTH|SOUTH|EAST|WEST|CENTRAL)\b', '', city,
flags=re.IGNORECASE).strip()
df1['CITY_CLEAN'] = df1['CITY'].apply(normalize_city)
# Convert 'sdate' to datetime
df1['sdate'] = pd.to_datetime(df1['sdate'], errors='coerce')
# Drop rows with invalid or missing dates
df1 = df1.dropna(subset=['sdate', 'EXPDATE'])
# Calculate tenure in days
df1['TENURE_DAYS'] = (df1['EXPDATE'] - df1['sdate']).dt.days
# Option 1: Tenure in months (approximate)
df1['TENURE_MONTHS'] = (df1['TENURE_DAYS'] / 30).round()
# Option 2: Tenure in years (approximate)
df1['TENURE_YEARS'] = (df1['TENURE_DAYS'] / 365).round(1)
df1[['sdate', 'EXPDATE', 'TENURE_DAYS', 'TENURE_MONTHS',
'TENURE_YEARS']].head()
         sdate        EXPDATE       TENURE_DAYS        TENURE_MONTHS         TENURE_YEARS
0   2018-08-01     2025-02-01              2376                 79.0                  6.5
1   2022-03-11     2024-08-21               894                 30.0                  2.4
2   2022-03-14     2024-09-01               902                 30.0                  2.5
3   2019-03-01     2024-09-01              2011                 67.0                  5.5
4   2022-03-07     2024-06-24               840                 28.0                  2.3
There is No clear pattern in column 'S' It contains single letters like 'K', 'S', 'R', 'M', etc. these
are not standard salutations or recognizable codes for gender or status. so choosing to drop
this column
# Drop the 'S' column
dfnew = df1.drop(columns=['S'])
dfnew.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 16516 entries, 0 to 16518
Data columns (total 15 columns):
 #   Column         Non-Null Count Dtype
--- ------          -------------- -----
 0   MAG            16516 non-null object
 1   SUBCODE        16516 non-null object
 2   MASTER_KEY     16514 non-null object
 3   DESIGNATION    16516 non-null object
 4   CITY           16514 non-null object
 5   STA            16516 non-null object
 6   REGIONCODE     16516 non-null float64
 7   NOOFCOPIES     16516 non-null float64
 8   sdate          16516 non-null datetime64[ns]
 9   EXPDATE        16516 non-null datetime64[ns]
 10 LASTEXPDA       16516 non-null datetime64[ns]
 11 CITY_CLEAN      16514 non-null object
 12 TENURE_DAYS     16516 non-null int64
 13 TENURE_MONTHS 16516 non-null float64
 14 TENURE_YEARS    16516 non-null float64
dtypes: datetime64[ns](3), float64(4), int64(1), object(7)
memory usage: 2.0+ MB
print(max(dfnew['TENURE_YEARS']))
print(min(dfnew['TENURE_YEARS']))
21.8
-6.3
Encountered negative tenure values. A negative tenure clearly means
that the sdate (subscription start date) is after the EXPDATE (expiry
date), which is logically incorrect for a subscription.
# Find rows where tenure is negative
dfnew[dfnew['TENURE_DAYS'] < 0]
       MAG SUBCODE MASTER_KEY       DESIGNATION         CITY   STA
REGIONCODE \
234    OLM 190240        8892   BRANCH MANAGER       PANIPAT   HAR
2.0
489    OLT   50396      13082           Unknown    HYDERABAD    AP
6.0
788     OL 355127       17585           Unknown     CALCUTTA    WB
7.0
928    OLM 176666       20233           Unknown    BANGALORE   KAR
8.0
1083    OL 360607       23395           Unknown      GURGAON   HAR
2.0
...     ...      ...         ...                 ...              ...   ...
...
16512    OL   501361      920566           Unknown           PALAKKAD   KER
6.0
16513    OL   501362      920567           Unknown          BANGALORE   KAR
8.0
16514   OLM   235620      920575           Unknown          BARDHAWAN    WB
2.0
16515   OLB    61778      920645           Unknown     CHAMPARAN (W)    BIH
2.0
16516   OLM   235673      920684           Unknown               AGRA    UP
2.0
        NOOFCOPIES        sdate      EXPDATE    LASTEXPDA       CITY_CLEAN    \
234            1.0   2024-11-01   2024-05-01   2024-05-01          PANIPAT
489            1.0   2024-02-01   2020-09-01   2024-12-01        HYDERABAD
788            1.0   2011-09-11   2006-08-28   2024-10-21         CALCUTTA
928            1.0   2024-11-01   2024-10-01   2024-10-01        BANGALORE
1083           1.0   2011-03-03   2006-12-04   2024-06-01          GURGAON
...            ...          ...          ...          ...              ...
16512          1.0   2024-11-11   2024-11-01   2024-11-01         PALAKKAD
16513          1.0   2024-11-11   2024-11-01   2024-11-01        BANGALORE
16514          1.0   2024-11-01   2024-10-01   2024-10-01        BARDHAWAN
16515          1.0   2024-11-01   2024-10-01   2024-10-01    CHAMPARAN (W)
16516          1.0   2024-11-01   2024-10-01   2024-10-01             AGRA
        TENURE_DAYS    TENURE_MONTHS    TENURE_YEARS
234            -184             -6.0            -0.5
489           -1248            -42.0            -3.4
788           -1840            -61.0            -5.0
928             -31             -1.0            -0.1
1083          -1550            -52.0            -4.2
...             ...              ...             ...
16512           -10             -0.0            -0.0
16513           -10             -0.0            -0.0
16514           -31             -1.0            -0.1
16515           -31             -1.0            -0.1
16516           -31             -1.0            -0.1
[62 rows x 15 columns]
new=dfnew[dfnew['TENURE_DAYS'] >= 0]
new.dropna(inplace=True)
new.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 16450 entries, 0 to 16477
Data columns (total 15 columns):
 #   Column         Non-Null Count Dtype
--- ------          -------------- -----
 0   MAG            16450 non-null object
 1   SUBCODE        16450 non-null object
 2   MASTER_KEY     16450 non-null object
 3   DESIGNATION    16450 non-null object
 4   CITY           16450 non-null object
 5   STA            16450 non-null object
 6   REGIONCODE     16450 non-null float64
 7   NOOFCOPIES     16450 non-null float64
 8   sdate          16450 non-null datetime64[ns]
 9   EXPDATE        16450 non-null datetime64[ns]
 10 LASTEXPDA       16450 non-null datetime64[ns]
 11 CITY_CLEAN      16450 non-null object
 12 TENURE_DAYS     16450 non-null int64
 13 TENURE_MONTHS 16450 non-null float64
 14 TENURE_YEARS    16450 non-null float64
dtypes: datetime64[ns](3), float64(4), int64(1), object(7)
memory usage: 2.0+ MB
new
        MAG SUBCODE MASTER_KEY DESIGNATION         CITY   STA   REGIONCODE
\
0       OLT    81221     5148        OWNER        SURAT   GUJ          4.0
1        OL   477892     5150          IPS    KOZHIKODE   KER          6.0
2        OL   477990     5159       OWNDER    HASSAN DT   KAR          8.0
3       OLM   188077     5167      Unknown    BANGALORE   KAR          8.0
4       OLH     8117     5168   CONSULTANT   NEW MUMBAI   MAH          4.0
...     ...      ...      ...          ...          ...   ...          ...
16473    OL   501283   919750      Unknown       RANCHI   BIH          2.0
16474    OL   501284   919751      Unknown     SHILLONG   MEG          2.0
16475    OL   501285   919752      Unknown    ANANTAPUR    AP          6.0
16476    OL   501286   919753      Unknown    BANGALORE   KAR          8.0
16477    OL   501287   919754      Unknown   HOOGHLY DT    WB          2.0
       NOOFCOPIES      sdate    EXPDATE LASTEXPDA     CITY_CLEAN
TENURE_DAYS \
0             1.0 2018-08-01 2025-02-01 2025-02-01          SURAT
2376
1             1.0 2022-03-11 2024-08-21 2024-08-21    KOZHIKODE
894
2             1.0 2022-03-14 2024-09-01 2024-09-01    HASSAN DT
902
3             1.0 2019-03-01 2024-09-01 2024-09-01    BANGALORE
2011
4             1.0 2022-03-07 2024-06-24 2024-06-24   NEW MUMBAI
840
...           ...          ...         ...     ...          ...
...
16473         1.0 2024-09-21 2024-10-11 2024-10-11       RANCHI
20
16474         1.0 2024-09-21 2024-10-11 2024-10-11     SHILLONG
20
16475         1.0 2024-09-21 2024-10-11 2024-10-11    ANANTAPUR
20
16476         1.0 2024-09-21 2024-10-11 2024-10-11    BANGALORE
20
16477         1.0 2024-09-21 2024-10-11 2024-10-11   HOOGHLY DT
20
        TENURE_MONTHS   TENURE_YEARS
0                79.0            6.5
1                30.0            2.4
2                30.0            2.5
3                67.0            5.5
4                28.0            2.3
...               ...            ...
16473             1.0            0.1
16474             1.0            0.1
16475             1.0            0.1
16476             1.0            0.1
16477             1.0            0.1
[16450 rows x 15 columns]
max(new['TENURE_YEARS'])
21.8
min(new['TENURE_YEARS'])
0.0
def categorize_tenure(years):
    if years < 1:
        return '< 1 Year'
    elif years < 3:
        return '1–2 Years'
    elif years < 5:
        return   '3–4 Years'
    elif years   < 10:
        return   '5–9 Years'
    elif years   < 15:
        return   '10–14 Years'
    else:
        return   '15+ Years'
# Apply to the dataframe
new['TENURE_BIN'] = new['TENURE_YEARS'].apply(categorize_tenure)
new.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 16450 entries, 0 to 16477
Data columns (total 16 columns):
 #   Column         Non-Null Count Dtype
--- ------          -------------- -----
 0   MAG            16450 non-null object
 1   SUBCODE        16450 non-null object
 2   MASTER_KEY     16450 non-null object
 3   DESIGNATION    16450 non-null object
 4   CITY           16450 non-null object
 5   STA            16450 non-null object
 6   REGIONCODE     16450 non-null float64
 7   NOOFCOPIES     16450 non-null float64
 8   sdate          16450 non-null datetime64[ns]
 9   EXPDATE        16450 non-null datetime64[ns]
 10 LASTEXPDA       16450 non-null datetime64[ns]
 11 CITY_CLEAN      16450 non-null object
 12 TENURE_DAYS     16450 non-null int64
 13 TENURE_MONTHS 16450 non-null float64
 14 TENURE_YEARS    16450 non-null float64
 15 TENURE_BIN      16450 non-null object
dtypes: datetime64[ns](3), float64(4), int64(1), object(8)
memory usage: 2.1+ MB
def categorize_tenure(months):
    if months < 12:
        return '< 12 moths'
    elif months < 36:
        return '12–24 months'
    elif months < 60:
        return '36–48 months'
    elif months < 120:
        return '60–108 months'
    elif months < 180:
        return '120–168 months'
    else:
        return '180+ months'
# Apply to the dataframe
new['Month_bin'] = new['TENURE_MONTHS'].apply(categorize_tenure)
max(new['TENURE_MONTHS'])
266.0
min(new['TENURE_MONTHS'])
0.0
new.to_excel('final_outlook_m3.xlsx', index=False)
new['DESIGNATION'].unique()
array(['OWNER', 'IPS', 'OWNDER', 'Unknown', 'CONSULTANT', 'RETD',
'PROF',
        'RETD COLONEL', 'PROPRIETOR', 'ADDL ASST ENGR', 'PARTNER',
        'CHIEF INSTRUCTOR', 'DIRECTOR', 'STUDENT', 'MANAGER',
'TEACHER',
        'PROP', 'DOCTOR', 'PO', 'D M', 'SR CONSULTANT', 'SR MANAGER',
        'SR ADVOCATE', 'S A O', 'B M', 'TOUR MANAGER', 'VICE
PRESIDENT',
        'G M', 'CGM', 'COLONEL', 'ENGINEER', 'COO', 'PRESIDENT',
        'NEWS EDITOR', 'MEDICAL SPECIALIST', 'PRINCIPAL',
        'SR HR EXECUTIVE', 'DGM', 'FIELD ENGR', 'ASST DIRECTOR',
        'JOURNALIST', 'ARCHITECT', 'CFO', 'V P', 'C A', 'MEDICAL
OFFICER',
        'DY GENERAL MANAGER', 'SR. VICE PRESIDENT', 'SR.SALES.ENGR.',
        'GEN PRACTICE', 'SR FIN MANAGER', 'AGM', 'DEAN', 'SR V P',
'CHIEF',
        'EDITOR', 'OFFICER (OPERATIONS SAFETY)', 'PROJ MANAGER',
'ADVISOR',
        'DERMATOLOGIST', 'BUSINESS', 'SR.MAN.', 'A V P', 'SELF EMPLYD',
        'ADVOCATE', 'CEO', 'SSA', 'D C P', 'PROFESSOR', 'SC OFFICER',
        'ASST COMMISSIONER OF INCOME TAX', 'LABOUR OFFICER',
        'CHOEF ENGINEER', 'ASSOCIATE EDITOR', 'BANK MANAGER', 'MDS',
'GM',
        'CHIEF EXEC BUSINESS', 'SUPERINTENDENT', 'COUNTRY MANAGER',
        'SPECIAL VIGILANCE JUDGE', 'V P FINANCE', 'JT G M', 'RETD
BRIG',
        'MANAGER-ADMINISTRATION', 'SMO', 'HYDROGEOLOGIST',
        'PROFFESSIIONAL', 'ASSISTANT PROFESSOR', 'DY S E (PRODN)',
        'SCIENTIST', 'PATAWARI', 'ASSISTANT', 'SURGEON', 'SERVICE',
        'ASSOCIATE PROFESSOR', 'MANAGING DIRECTOR', 'SR ASST ENGINEER',
        'M D', 'DY LIBARIAN', 'DY LIBRARIAN', 'POLITICIAN',
        'ASSOCIATE PROF', 'C G M', 'EX C M S', 'SR S/W ENGR',
'EXECUTIVE',
        'TECH CONSULTANT', 'SDM', 'CHIEF MANAAGER', 'PENSIONER',
        'ASTT ENGR', 'ENGR', 'DY GEN MGR', 'EXECUTIVE ENGINEER',
        'RETD DGM', 'OPERATIONS MANAGER', 'HEAD', 'READER IN BANKING',
        'INSPECTOR', 'CARDIOLOGIST', 'HEAD-AES', 'MANAGER SALES',
        'DY MANAGER', 'A M', 'CA', 'ORTHODONTIST', 'ASST MGR',
        'PROJ-COORDINATOR', 'PHYSICIAN', 'COFFEE PLANTER', 'PROFFESOR',
        'CONSULTANT (PAED)', 'SR EXECUTIVE', 'ELECTRIC ENGINEER',
        'HEAD MASTER', 'PROFESSIONAL', 'EXE DIRECTOR', 'SR DIRECTOR',
        'SR GEN MGR', 'ASTT MANAGER', 'AGRICULTURE', 'CONTROLLER',
'PROFE',
        'DY CHEIF SALES MANAGER', 'DY E E', 'SPEL ASSTT', 'LT COL',
        'DY PROJ MGR', 'ASST ACCOUNTS OFFICER', 'COMMANDANT', 'A E',
        'ASISTANT VICE PRESIDENT', 'DEVELOPMENT OFFICER', 'JT MD',
        'MARKETING OFFICER', 'CHIEF EDITOR', 'INSTITUTIONS',
        'SELF EMPLOYED', 'REPORTER', 'ENT SURGEON', 'OFFICER SURVEYOR',
        'LIBRARIAN', 'CMD', 'RETD PROFESSOR', 'PROFESSOR RTRD', 'S D
E',
        'AVP', 'AUDIT OFFICER', 'ASSTT DIRECTOR', 'SCI ENGR-SF',
        'CHAIRMAN', 'SDGM', 'S/C II', 'CDR', 'SALES MANAGER',
'OFFICER',
        'ASSOCIATES', 'NO', 'SR PROGRAM MGR', 'CHILD SPECIALIST',
'CLERK',
        'IAS RETD', 'EXEC CHIEF', 'VAS DOCTOR', 'HOUSE WIFE',
        'GYNECLOGIST', 'DY D G', 'MANAGER HR', 'NEUROLOGIST', 'NSE
DEALER',
        'SR LIBRARIAN', 'MD & CEO', 'DIVISIONAL MANAGER', 'SR MGR',
        'SR B A', 'MANAGER (NB)', 'COL', 'SR ENGINEER SALES',
        'MANAGING PARTNER', 'SENIOR CONSULTANT', 'DENTIST', 'SR ENGR',
        'TECH.OFFICER', 'BUSINESSMAN', 'S A', 'SECRETARY',
        'EXECUTIVE EDITOR', 'ADMINISTRATOR', '(LIBRARIAN)', 'JT
MANAGER',
        'S V P', 'SCIENTIST D', 'SQN CDR', 'RETD SBI OFFICER', 'JT A V
P',
        'DY ACCOUNTANT', 'SR ANALYST', 'SUPERIOR & CORRESPODENT',
        'OFFICER COMMANDING', 'MAJ GEN', 'JR SUPDT', 'ASST EDITOR',
        'BANK OFFICER', 'HR MANAGER', 'AUDITOR', 'ACCTS OFFICER',
        'WRH MANAGER', 'EXE', 'SR MGR ADMIN', 'HOUSEWIFE', 'S O',
        'ASSTT MANAGER', 'SEC SO-H', 'AM', 'CHIEF ENGR', 'DY CHIEF
ENGR',
        'SUPERVISOR', 'DY MGR', 'R H', 'SECTION MANAGER',
        'SECTIONAL ELECTRICAL ENGR', 'GP CAPT', 'LECTURER', 'COAST
GUARD',
        'MAJ GENERAL', 'READER IN HISTORY', 'GENERAL SECRETARY', 'EVP',
        'ANALYST', 'DEPUTY MANAGER', 'INS COUNSELLOR', 'D G M',
        'PROGRAMMER', 'TECH ARCHITECT', 'S/W ENGR', 'ASSTT COST AQTT',
        'BANKER', 'SS', 'RETIRED', 'MGR', 'INVESTMENT CONS',
        'INSURANCE CONSULTANT', 'SUPDT', 'INSURANCE', 'SR CDR',
        'PROPRITOR', 'CHIEF RESEARCH ANALYST', 'SR DOCTOR',
        'COMPANY SECRETARY', 'CLUSTER BRANCH MANAGER', 'PRICE
WATERHOUSE',
        'GOVT SERVICE', 'SR SECTION OFFICER', 'CHIEF CONSULTANT',
       'ENGG ASSTT', 'A S M', 'P. A. ENQUIRY', 'DIRECTOR HR',
'CLEARK',
       'SR ASSISTANT', 'I T', 'ADM OFFICER', 'LT COLONEL', 'C M',
       'INSURANCE AGENT', 'RESOURCE MANAGER', 'ORTHO SURGEON',
       'SCIENTIFIC OFFICER', 'CASHIER', 'RADIOLOGIST', 'SALES
OFFICER',
       'MG PARTNER', 'RETD ASST DIRECTOR', 'PRODUCT MANAGER', 'SMC',
       'FCA', 'FIN ADVISOR', 'FINANCIAL PLANNER', 'C M E', 'RETD M P',
       'MARINE ENGR', 'OPER HEAD', 'R M', 'SUPDTT', 'DOCTOR MD',
       'MEMBER BOS', 'SR GENERAL MANAGER HR', 'ASST ENGR',
       'HEAD BIOTECHNOLOGY', 'DEPUTY EDITOR', 'RTD BANK MANAGER',
       'LAWYER', 'SR. B.MGR.', 'DY DIRECTOR', 'CHIEF ARCHITECT', 'C
E',
       'PROPERIETOR', 'AREA SALES MANAGER', 'COORDINATOR',
       'ADDITIONAL DIRECTOR', 'HOD', 'TECHNICAL ARCHITECT',
       'ARMY OFFICER', 'ASSTT PROFESSOR', 'HEADMISTRESS', 'CAPT',
       'TECHNICAL LEAD', 'COMMANDER', 'SR PROJECT MANAGER', 'SEC',
       'MED PRACTITIONER', 'SR MGR COMMERCIAL', 'S D O', 'SELF',
       'RETRD DIR NSO', 'A P', 'SVP', 'ASST PROFESSOR', 'JR ENGR',
       'A/C EXEC', 'SR LECTURER', 'JTO', 'ADL COMMISSIONER', 'SPM',
       'C E O', 'MANAGER S.I.B.', 'EXECUTIVE OFFICER', 'BANK
EMPLOYED',
       'RETD EXEC', 'COST ACCTT', 'F C A', 'NEUROSURGEON',
       'SR DIV MED OFFR', 'PROJECT DIRECTOR', 'PROCESS ENGINEER',
       'EXECUTIVE DIRECTOR', 'PROMOTER', 'SCIENTIST/ENGR', 'DY S E
(E)',
       'OPERATIONS ENGINEER', 'SELF-EMPLOYED', 'CHAURMAN', 'SR
SCIENTIST',
       'CVO', 'SALES EXEC', 'BUSINESS MAN', 'PROJECT MANAGER', 'P S
R',
       'JR.EXECUTIVE', 'TRP', 'HOUSE-WIFE', 'SHIFT ENGR', 'ASST
MEDICAL',
       'ASST AUDIT OFFICER', 'ASSISTANT MANAGER', 'BRANCH HEAD',
       'MEDICAL REP', 'ACCT OFFICER', 'CLOTH MERCHANT',
       'ASST ADMN OFFICER', 'DOCTOR (VETNARY)', 'REVENU OFFICER',
       'HEAD DEPT OF COMMERCE', 'SQM', 'GLOBAL HEAD', 'JAO RETIRED',
       'MEMBER (ORD FYS)', 'MANAGER ACCOUNTS', 'PROG MANAGER',
       'SR TKT COLLECTOR', 'PROP.', 'RETD OFFICER', 'SYS DESIGNER',
       'A G M', 'LECT CARDIOLOGY', 'S T S', 'DY TAHSILDAR', 'PROD
F/C',
       'S.S.ENGR.', 'CO', 'HON SECRETARY', 'ENGGR', 'RETD ARMY', 'DY G
M',
       'SR. PROJECT MANAGER', 'DEPUTY GEN MANAGER', 'SOCIAL WORKER',
       'READER DEPT OF ENG', 'INVESTOR', 'CO SECRETARY',
       'TAX RECOVERY OFFICER', 'A/F', 'ASST GEN MGR', 'REGISTRAR',
       'STENO', 'REGIONAL MANAGER', 'QA ANALYST', 'SR PROJ MANAGER',
       'RETD BANK MANAGER', 'RETD CME (NFRLY)', 'DTO', 'ADL DIRECTOR',
       'ITM', 'STENOGRAPHER', 'AGRICULTURIST', 'LAB TECHNICIAN',
       'FOUNDER', 'ADMIN', 'C T O', 'SURVEYOR ENGR', 'MEDIA PERSON',
        'HEAD OPRN & BD', 'SR ARTIST', 'ACCOUNTS OFFICER', 'J D',
        'HEAD OF HR & FIN', 'GYNAECOLOGIST', 'S/W CONSULTANT',
        'PRACTIONER', 'SDE', 'ICU-IN-CHARGE', 'ACCOUNT', 'CHIEF
EXECUTIVE',
        'PROPRIETOR/MANAGER', 'CSO', 'BUSIENSS', 'OFFICE', 'S R O',
        'ASTT PROFESSOR', 'CHIEF MANAGER', 'V P RETAIL', 'SR
PROFESSOR',
        'FARMER', 'ASST MANAGER', 'LIC AGENT', 'TRUSTEE', 'CIVIL ENGG',
        'TEAM LEADER', 'HEALTH INSPECTOR', 'ASSO MANAGER', 'ASP',
        'ASST (STENO)', 'BROCKER', 'ASST V P', 'F O', 'CAPTAIN',
        'SR PROJ ASTT', 'INCHARGE', 'ART DIRECTOR', 'ASTT SURGEON',
        'EDUCATOR', 'DIVISIONAL ENGINEER', 'JR ASSTT', 'I O', 'CIO',
        'SECRETARY GEN', 'DIRECTOR-FIN', 'GOVT SERVANT', 'IAS OFFICER',
        'SR SALES ENGG', 'RIGISTRAR', 'MERCHANT', 'HOME MAKER', 'JGM',
        'DPA', 'SEEREFARY MINISTER', 'PROG MGR', 'SEC MANAGING BOARD',
        'EXE PRESIDENT', 'PGT', 'CHIEF ENGINEER', 'ACCOUNT OFFICER',
        'ASTT VP', 'FIN CONTROLLER', 'CITY HEAD', 'PHARMACIST',
        'ACCOUNTANT', 'ZONAL MANAGER', 'ARTICLE ASTT', 'M.D', 'HEAD-
LEGAL',
        'STR ENG', 'IFA', 'SELF EMLPYD', 'TRAINER', 'BUSINES',
        'DY SUPTD R G KAR MEDICAL', 'LDC', 'ACCOUNTS', 'SR ASST',
'JMD',
        'CREDIT MANAGER', 'C M O', 'JT PRESIDENT', 'ENG', 'ASTT
DESIGNER',
        'T L', 'POLICE', 'MANEGER', 'GENERAL MANAGER', 'RETD AEO',
'\\',
        "'", 'QA', 'ASSOCIATE', 'ASST PROF', 'COMPANY', 'SR PRINCIPAL',
        'SALES', 'SENIOR MANAGER', 'JUDGE', 'SENIOR MAANGER',
        'COMMANDING OFFICER', 'FREELANCE'], dtype=object)
import pandas as pd
# Example: assuming your DataFrame is called `new`
# Ensure all designations are in uppercase and stripped of extra
whitespace
new['DESIGNATION'] =
new['DESIGNATION'].astype(str).str.strip().str.upper()
# Mapping dictionary for grouping
designation_map = {
    # Business Owners
    'OWNER': 'Business Owner', 'OWNDER': 'Business Owner',
'PROPRIETOR': 'Business Owner',
    'PARTNER': 'Business Owner', 'PROP': 'Business Owner', 'MANAGING
PARTNER': 'Business Owner',
    'BUSINESS': 'Business Owner', 'SELF EMPLOYED': 'Business Owner',
'SELF-EMPLOYED': 'Business Owner',
    'BUSINESS MAN': 'Business Owner', 'BUSINESSMAN': 'Business Owner',
'PROMOTER': 'Business Owner',
    # Doctors & Medical
    'DOCTOR': 'Doctor/Medical', 'PHYSICIAN': 'Doctor/Medical',
'SURGEON': 'Doctor/Medical',
    'ENT SURGEON': 'Doctor/Medical', 'GYNECLOGIST': 'Doctor/Medical',
'GYNAECOLOGIST': 'Doctor/Medical',
    'CARDIOLOGIST': 'Doctor/Medical', 'NEUROLOGIST': 'Doctor/Medical',
'ORTHODONTIST': 'Doctor/Medical',
    'NEUROSURGEON': 'Doctor/Medical', 'MEDICAL OFFICER':
'Doctor/Medical', 'SR DOCTOR': 'Doctor/Medical',
    'CONSULTANT (PAED)': 'Doctor/Medical', 'MED PRACTITIONER':
'Doctor/Medical',
    'VAS DOCTOR': 'Doctor/Medical', 'DERMATOLOGIST': 'Doctor/Medical',
    # Engineers & Technical
    'ENGINEER': 'Engineer', 'CIVIL ENGG': 'Engineer', 'ELECTRIC
ENGINEER': 'Engineer',
    'SOFTWARE ENGINEER': 'Engineer', 'PROCESS ENGINEER': 'Engineer',
'EXECUTIVE ENGINEER': 'Engineer',
    'TECH ARCHITECT': 'Engineer', 'TECHNICAL ARCHITECT': 'Engineer',
'TECHNICAL LEAD': 'Engineer',
    'OPERATIONS ENGINEER': 'Engineer', 'S/W ENGINEER': 'Engineer', 'SR
ENGINEER SALES': 'Engineer',
    # Professors & Academics
    'PROF': 'Academia', 'PROFESSOR': 'Academia', 'LECTURER':
'Academia',
    'ASSISTANT PROFESSOR': 'Academia', 'ASSOCIATE PROFESSOR':
'Academia',
    'SR PROFESSOR': 'Academia', 'DEAN': 'Academia', 'TEACHER':
'Academia',
    'EDUCATOR': 'Academia', 'PRINCIPAL': 'Academia', 'HEAD MASTER':
'Academia',
   # Students
   'STUDENT': 'Student',
    # Consultants & Advisors
    'CONSULTANT': 'Consultant', 'ADVISOR': 'Consultant', 'SR
CONSULTANT': 'Consultant',
    'SENIOR CONSULTANT': 'Consultant', 'TECH CONSULTANT':
'Consultant',
    # Managerial & Executive
    'MANAGER': 'Manager/Exec', 'SR MANAGER': 'Manager/Exec', 'GENERAL
MANAGER': 'Manager/Exec',
    'GM': 'Manager/Exec', 'G M': 'Manager/Exec', 'VICE PRESIDENT':
'Manager/Exec',
    'V P': 'Manager/Exec', 'VP': 'Manager/Exec', 'AVP':
'Manager/Exec',
    'CEO': 'Manager/Exec', 'C E O': 'Manager/Exec', 'COO':
'Manager/Exec', 'MD': 'Manager/Exec',
    'M D': 'Manager/Exec', 'DIRECTOR': 'Manager/Exec', 'EXECUTIVE':
'Manager/Exec',
    'PROJECT MANAGER': 'Manager/Exec', 'SR PROJECT MANAGER':
'Manager/Exec',
    # Legal & Law
    'ADVOCATE': 'Law', 'LAWYER': 'Law', 'SR ADVOCATE': 'Law',
    # Media & Journalism
    'JOURNALIST': 'Media', 'EDITOR': 'Media', 'NEWS EDITOR': 'Media',
    'EXECUTIVE EDITOR': 'Media', 'CHIEF EDITOR': 'Media',
    # Government Services
    'IAS': 'Government', 'IPS': 'Government', 'POLITICIAN':
'Government',
    'OFFICER': 'Government', 'RETD OFFICER': 'Government', 'RETD
COLONEL': 'Government',
    'RETD': 'Government', 'RETIRED': 'Government', 'PENSIONER':
'Government',
    # Homemaker
    'HOUSEWIFE': 'Homemaker', 'HOUSE WIFE': 'Homemaker', 'HOME MAKER':
'Homemaker',
    # Finance & Accounting
    'CA': 'Finance', 'CFO': 'Finance', 'FINANCIAL PLANNER': 'Finance',
    'ACCOUNTANT': 'Finance', 'ACCOUNTS OFFICER': 'Finance', 'AUDITOR':
'Finance',
    # Others
    'UNKNOWN': 'Unknown', 'NO': 'Unknown'
}
# Apply the mapping
new['DESIGNATION_GROUPED'] = new['DESIGNATION'].map(lambda x:
designation_map.get(x.strip(), 'Other'))
# Check how many in each group
print(new['DESIGNATION_GROUPED'].value_counts())
Unknown          14465
Other              728
Manager/Exec       430
Business Owner     253
Government         156
Doctor/Medical     123
Academia            99
Consultant          62
Law                 32
Finance              31
Engineer             28
Student              20
Homemaker            13
Media                10
Name: DESIGNATION_GROUPED, dtype: int64
print(new['CITY_CLEAN'].unique().tolist())
['SURAT', 'KOZHIKODE', 'HASSAN DT', 'BANGALORE', 'NEW MUMBAI', 'PATNA
DT', 'MOGA', 'CALCUTTA', 'SAGAR', 'NOIDA', 'JALGAON DT', 'THANJAVUR',
'HYDERABAD', 'NEW DELHI', 'SOLAN', 'MUMBAI', 'INDORE', 'GHAZIABAD DT',
'PONDICHERRY', 'PANIPAT', 'SURAT DT', 'COIMBATORE', 'KANPUR',
'RAJAMUNDRY', 'TRICHY', 'VISAKHAPATNAM', 'THANE (W)', 'DEHRADUN',
'ANANTAPUR', 'HOOGHLY DT', 'PATNA', 'NADIA DT', 'PUNE', 'LUDHIANA DT',
'THANE', 'KOCHI', 'PRAKASAM DT', 'ALLAHABAD', 'RAJKOT', 'JAIPUR',
'THANE DT', 'NASIK', 'JAMMU TAWI', 'GOA', 'NAGPUR', 'GUNTUR', 'BELGAUM
DT', 'AHMEDABAD', 'LUCKNOW', 'AMBALA', 'RANCHI', 'FEROZPUR',
'PIRAVOM', 'JODHPUR', 'PUNE DT', 'KOLAR DT', 'SRINAGAR', 'GURGAON',
'IMPHAL', 'KOLHAPUR', 'GURGAON DT', 'GWALIOR', 'VALSAD DT',
'PILIBHIT', 'VADODARA', 'CALICUT', 'PANCHKULA', 'BHUBNESHWAR',
'RISHIKESH', 'DEHRADUN DT', 'MANGALORE', 'MALAPPURAM DT', 'GUWAHATI',
'GANDHINAGAR', 'UDAIPUR', 'JHAJJAR DT', 'DELHI', 'RATNAGIRI DT',
'CHENNAI', 'VELLORE', 'SOLAN DT', 'MOHALI', 'RAJSAMAND DT', 'BULDANA
DT', 'CHANDIGARH', 'BHOPAL', 'JAGDALPUR', 'PALGHAT', 'R R DT', 'NASIK
DT', 'PANAJI', 'KOLHAPUR DT', 'SECUNDERABAD', 'RANCHI DT', 'BARODA',
'HUBLI', 'DURG', 'TIRUVALLA', 'TRIVANDRUM', 'VARANASI', 'SATARA',
'LUDHIANA', 'FARIDKOT DT', 'ALAPPUZHA DT', 'THALLASSERY', 'SALEM',
'ROHTAK', 'JODHPUR DT', 'KANGRA DT', 'ABOHAR', 'DHULE', 'CHINDWARA',
'CHHINDWARA', 'CHHINDWARA DT', 'SILIGURI', 'KOTA', 'PATIALA DT',
'PATHANKOT', 'DURGAPUR', 'AJMER', 'TIRUPATI', 'UTTAR KANNADA DT',
'PURULIA DT', 'JAMMU', 'NAGAUR DT', 'ERODE', 'AMRITSAR', 'FARIDABAD',
'DAVANGERE', 'SHIMLA', 'KARUR', 'MEERUT', 'BAHADURGARH',
'CHITTORGARH', 'TIRUPUR', 'RAIPUR', 'KOTTAYAM DT', 'KANNUR', 'S KODAGU
DT', 'CHATTISHGARH', 'AKOLA', 'BIKANER DT', 'RAIGARH (MP) DT',
'THRISSUR DT', 'MUNGER', 'SOUTH COORG', 'JAMSHEDPUR', 'HOSHIARPUR',
'AMBALA CITY', 'GOA (N) DT', 'ANAND', 'MUKTSAR DT', 'AGRA', 'GREATER
NOIDA', 'DELHI CANTT', 'MYSORE', 'INODRE', 'NAINITAL DT', 'MADURAI',
'SANGLI DT', 'SAKLESPUR', 'SHIMOGA', 'COIMBATORE DT', 'ZIRAKPUR',
'KUTCHCH', 'CUTTACK', 'BURDWAN DT', 'GHAZIABAD', 'MUZAFFARNAGAR',
'NAVSARI', 'VILLUPURAM DT', 'BELGAUM', 'TIRUNELVELI DT', 'HOWRAH', 'D
K DIST', 'ANAND DT', 'JALANDHAR', 'UTTAR PRADESH', 'BASTAR', 'EAST
GODAVARI DT', 'JALPAIGURI DT', 'CUDDAPAH', 'TIRUNELVELI', 'BHARUCH
DT', 'GOPALGANJ DT', 'BRAMHAPUR', 'JABALPUR', 'MANSA', 'ALIGARH',
'NALGONDA', 'RAIGAD', 'NALBARI DT', 'AMBALA CANTT', 'JHARSUGUDA DT',
'PUDUKKOTTAI', 'FATEHGARH SAHIB DT', '24 PARGANAS (N)', 'NAMAKKAL
DISTRICT', 'MAYURBHANJ', 'BARIPADA', 'JAIPUR DT', 'AMROHA', 'SIRSA
DT', 'KURNOOL', 'BANGALORE (N)', 'THIRUVALLUR DT', 'MANIPUR',
'TANDUR', 'VIJAYAWADA', 'MURSHIDABAD DT', 'TRIPURA (W)', 'TINSUKIA',
'SUNDERGARH DT', 'DHARWAR', 'PALAKKAD', 'BIKANER', 'BAREILLY DT',
'DAUSA DT', 'PERAMBALUR DT', 'REWARI DT', 'KODAGU DT', 'UDUPI DT',
'TINSUKIA DT', 'MIRZAPUR DT', 'ALLEPPEY', 'CHIKMANGALUR', 'RAIGAD DT',
'ERODE DT', 'CUTTACK DT', 'HARIDWAR', 'ERNAKULAM DT', 'KANGRA',
'KOTHAGUDEM', 'KOTA DT', 'ROURKELA', 'ROORKEE', 'SIKAR', 'TUTICORIN
DT', 'TANJORE DT', 'SIROHI DT', 'TIRUVANNAMALAI', 'KOLLAM DT',
'KOTTAYAM', 'UJJAIN', 'PORBANDAR', 'UDIPI DT', 'MARGAON', 'DAMOH DT',
'KAITHAL', 'JALGOAN', 'BULANDSHAHAR', 'RAJOURI DT', 'RAJGARH',
'BAGALKOTE DT', 'D K DT', 'WARANGAL DT', 'AHMEDNAGAR DT', 'TIRUPPUR
DT', 'AHMEDNAGAR', 'AJMER DT', 'VIRUDHUNAGAR DT', 'INDORE DT',
'NARMADA DT', 'NAWANSHAHR', 'PHULBANI', 'BONGAIGAON', 'CHAMPAWAT DT',
'JHUNJHUNU DT', 'DAVANGERE DT', 'NANDED', 'NILGIRIS DT', 'CHARKHI
DADRI', 'JALGAON', 'HOWRAH DT', 'CHIRAWA', 'BELLARY DT', 'GANJAM DT',
'CALICUT DT', 'SIRSI', 'KOLLAM', 'BULDHANA', 'TRICHUR DT',
'FARRUKHABAD DT', 'PANCHMAHAL DT', 'SONITPUR DT', 'AURANGABAD',
'SATARA DT', 'KOZHIKODE DT', 'BELLARY', 'DURG DT', 'GUNA', 'MEDAK DT',
'BANGALORE DT', 'JHARSUGUDA', 'BATHINDA', 'SONIPAT', 'VIZIANAGARAM',
'JAMNAGAR DT', 'BARGARH', 'MAPUSA', 'GORAKHPUR', 'SONEBHADRA DT',
'THRISSUR', '24 PARGANAS (S) DT', 'JAMNAGAR', 'PERUNDURAI', 'OSMANABAD
DT', 'PHAGWARA', 'U K DT', 'SRIGANGANAGAR', 'SARGUJA DT', 'IDUKKI DT',
'PILANI', 'WAYANAD DT', 'GANDHINAGAR DT', 'SIRMAUR DT', 'SRIKAKULAM
DT', 'COCHIN', 'REWA', 'PANJIM', 'AIZAWAL', 'MUSSOORIE', 'MANJERI',
'BHIWANI DT', 'DAKSHIN KANNADA', 'AHMEDABAD DT', 'DAKSHINA KANNADA',
'KARELI', 'AMRAVATI', 'CHITTOOR DT', 'KULLU DT', 'UTTARKASHI', 'THE
NILGIRIS', 'KANNUR DT', 'KAPURTHALA', 'C/O 56 APO', 'SITAPUR',
'THALASSERY', 'ULHASNAGAR', 'KRISHNAGIRI', 'SOLAPUR DT', 'HOSUR',
'BAREILLY', 'SIRHIND', 'LATUR', 'DADAR & NAGAR HAVELI', 'SIWAN',
'ANGUL DT', 'KURSEONG', '24 PARGANAS (N) DT', 'SINDHUDURG DT',
'JALPAIGURI', 'VASCO-DA-GAMA', 'PANCHKULA DT', 'SOLAPUR', 'TENALI',
'DEORIA', 'RAMNAD DT', 'BHILAI', 'BHAVNAGAR', 'FARIDKOT', 'SHILLONG',
'AMRAVATI DT', 'NASHIK', 'AGARTALA', 'PATHANAMTHITTA DT', 'UNA DT',
'DIBRUGARH', 'JALANDHAR CITY', 'K K DT', 'ARARIA DT', 'MANDI DT',
'NAINITAL', 'HISSAR', 'KHAMMAM DT', 'BHARUCH', 'KURNOOL DT',
'ERNAKULAM', 'BURDWAN', 'BIJAPUR DT', 'KANCHEEPURAM DT', 'KOTTYAM',
'NELLORE DT', 'HOSHANGABAD DT', 'GAYA', 'BILASPUR DT (MP)', 'JHANSI',
'NAMAKKAL DT', 'AMBERNATH', 'NALANDA DT', 'PURULIA', 'DEWAS',
'KURUKSHETRA', 'PANNA', 'TIRUVARUR', 'SALEM DT', 'CHIKMAGALUR DT',
'PALANPUR', 'NAVSARI DT', 'NAGAON DT', 'FIROZABAD DT', 'TIRUPUR DT',
'TRICHUR', 'KRISHNA DT', 'MEHSANA DT', 'BULDHANA DT', 'WARDHA DT',
'TUMKUR DT', 'MIDNAPUR DT', 'WARANGAL', 'DHANBAD', 'PATIALA',
'LAKHIMPUR DT', 'AMROHA DT', 'PALI DT', 'TIRUCHIRAPALLI', 'ALMORA',
'HOSPET', 'CHENGALPATTU', 'BALASORE', 'DHAR', 'VARODARA', 'HAPUR',
'MALLAPURAM DT', 'LATUR DT', 'CHIDAMBARAM', 'MEDCHAL', 'AKOLA DT',
'KOVILPATTI', 'DHARWAD', 'NEW AURANGABAD', 'YAMUNANAGAR', 'KUTCH',
'DHULIA', 'PALAKKAD DT', 'VELLORE DT', 'ALWAR', 'VAISHALI DT',
'PUTTUR', 'RAIGADH DT', 'KULITHALIA', 'BANSWARA DT', 'POLLACHI',
'CHIKMANGALUR DT', 'MYSORE DT', 'CUDDALORE DT', 'KATNI', 'PALI-
MARWAR', 'BAGALKOT', 'ALMORA DT', 'GULBARGA DT', 'MALDA DT',
'GULBARGA', 'HOSHIARPUR DT', 'SIKAR DT', 'PALGHAR DT', 'UDUPI',
'ADILABAD DT', 'ALWAR DT', 'CHITRADURGA', 'DHARWAD DT', 'SATNA DT',
'MOHALI DT', 'JAUNPUR DT', 'PALAI', 'BANKURA DT', 'SIVAGANGAI DT',
'GORAKHPUR DT', 'WEST GODAVARI', 'MARGOA', 'KASARAGOD', 'NIZAMABAD
DT', 'BIRBHUM DT', 'D K', 'KHURJA', 'CHENGANNUR', 'VARANASI DT',
'RAMPUR DT', 'SHIMOGA DT', 'SIWAN DT', 'RAYAGADA DT', 'MANDSAUR',
'SONARPUR', 'UDHAM SINGH NAGAR DT', 'HASSAN', 'SWITZERLAND', 'HARAYANA
DT', 'KAKINADA', 'AHMADNAGAR DT', 'E G DT', 'TRIVELLORE', 'VIJAPUR',
'MAVELIKARA', 'PATHANAMTHITA DT', 'BARMER DT', 'PRAKASAM', 'MANIPAL',
'RAMGARH DT', 'THANJAVUR DT', 'SONIPAT DT', 'PALLURUTH', 'DHARMAPURI
DT', 'AMRELI', 'SIVAGANGA DT', 'CHAMBA DT', 'BHAGALPUR DT',
'CHICKMAGALUR', 'NAMAKKAL', 'MAHENDERGARH DT', 'LUCKNOW CANTT', 'WEST
GODAVARI DT', 'KATHUA DT', 'KODAGU', 'TRISSUR', 'SIVSAGAR',
'AUSTRALIA', 'DIBRUGARH DT', 'ALUVA', 'ALLEPPEY DT', 'BHATINDA',
'THODUPUZHA', 'NANJANGUD', 'KORAPUT DT', 'KALAMASSERY', 'RAJOURI',
'BANASKANTHA DT', 'GURDASPUR DT', 'BHIND DT', 'PONDA', 'NAGERCOIL',
'24 PARGANAS DT', 'SANGRUR DT', 'KOLKATA', 'SINGRAULI DT', 'PHILLAUR',
'SINDHUDURGA DT', 'SIANG (E)', 'TRIVENDRUM', 'MUKATSAR DT',
'KRISHNAGIRI DT', 'JALNA', 'KASARAGOD DT', 'USA', 'MUZAFFARPUR DT',
'RAJAPALAYAM', 'SINGHBHUM (E)', 'PAURI GARHWAL DT', 'TRICHY DT',
'KAMRUP DT', 'GAYA DT', 'SIROHI', 'CHICKAMAGLUR DT', 'NAGAUR', 'KUTCH
DT', 'ROHTAK DT', 'GANGTOK', 'KOPPAL DT', 'BARODA DT', 'VISNAGAR',
'YERCAUD', 'BEED DT', 'KODERMA', 'PURNIA', 'KHARAGPUR', 'NAGPUR DT',
'KARAIKUDI', 'BOKARO STEEL CITY', 'TAPI DT', 'C/O 99 APO', 'KARBI
ANGLONG DT', 'CHANDRAPUR DT', 'RAIGAD (CHHATTISGARH)', 'MARGAO',
'KARNAL', 'RAIPUR DT', 'ARIYALUR DT', 'DARJEELING', 'SAMBALPUR',
'CHITRADURGA DT', 'BHILWARA DT', 'VIRUDHUNAGAR', 'HINGOLI DT', 'S
COORG', 'VISAKHAPATNAM DT', '24 PARGANAS', 'SULTANPUR DT', 'MURADABAD
DT', 'JORHAT DT', 'NC HILLS DT', 'MANDYA DT', 'THIRUVANANTHAPURAM',
'BIJAPUR', 'CHIKAMAGALUR DT', 'FATEHPUR DT', 'ERNAKULUM DT',
'MEHSANA', 'JORHAT', 'PALLADAM', 'KAPURTHALA DT', 'THE NILGIRS DT',
'PALLAKAD', 'KOTHAMANGALAM', 'MODI NAGAR', 'GARHWA', 'KATHWA', 'JAMMU
& KASHMIR', 'PANCHMARHI', 'SAMBALPUR DT', 'LAKHIMPUR KHERI',
'THIRUVAMBADY', 'BOLANGIR', 'DHENKANAL DT', 'DHENKANAL', 'SHRAWASTI',
'MUZAFFARPUR', 'JIND', 'KADAPA DT', 'SABARKANTHA DT', 'MALAPPURAM',
'PAPUMPARE', 'BHAGALPUR', 'RAJKOT DT', 'NASIK RD', 'SIKKIM', 'KHURDA
DT', 'MUZAFFARNAGAR DT', 'PARBHANI DT', 'PAYYANUR', 'BIJNOR',
'PANRUTI', 'SRI GANGANAGAR DT', 'DIGBOI', 'KANCHIPURAM DT', 'SRI
MADHOPUR', 'VERAVAL', 'FARIDABAD DT', 'SITAMARHI DT', 'PALGHAR',
'TRIPURA', 'BHANDARA DT', 'NELLORE', 'BONGAIGAON DT', 'HOSIARPUR',
'KUSHINAGAR DT', 'TIRUCHIRAPALLI DT', 'MORENA', 'ROPAR DT', 'KASHMIR',
'KANPUR DT', 'MAHARAJGANJ', 'PURI DT', 'BHOJPUR DT', 'TRICHIRAPALLI',
'TIRUCHENGODE', 'UDUDI', 'SATNA', 'KANDHAMAL DT', 'DAKSHIN DINAJPUR',
'AZAMGARH', 'SIDHI DT', 'MOTIHARI', 'NIZAMABAD', 'MAJITAR', 'BOLANGIR
DT', 'KOKRAJHAR DT', 'JEHANABAD DT', 'SHEOPUR DT', 'ETAH', 'JAMMU DT',
'GONDIA', 'DHULE DT', 'VISHAKAPATNAM', 'JUNAGARH', 'UTTARA KANNADA
DT', 'HAZARIBAG DT', 'RAJPIPLA', 'SEHORE', 'ODISHA', 'SAMBHAL DT',
'BENGALURU', 'MORBI', 'WASHIM DT', 'KOTTAKKAL', 'CHANGANACHERRY',
'THENI DT', 'TIRUCHIRAPPALLI', 'GURUGRAM', 'CHANDAULI', 'BALIA',
'KOLLAKADAVU', 'BALAGHAT', 'BUXAR', 'MALKAPUR', 'VIZIANAGARAM DT',
'FARRUKHABAD', 'COOCH BEHAR', 'MAHENDERGARH', 'COOCH BEHAR DT',
'BILASPUR DT', 'AURANGABAD DT', 'BHANDARA', 'RAJAHMUNDRY', 'DULIAJAN',
'PARUMALA', 'POOVARANI', 'KANCHIPURAM', 'PALWAL', 'RAMGARH CANTT', 'J
P NAGAR DT', 'TRIPUNITHURA', 'SAHARANPUR', 'RATNAGIRI', 'JOSHIMATH',
'UTTARA KANNADA', 'CHANDRAPUR', 'DEOGHAR DT', 'MATHURA', 'DEOGHAR',
'BHUBANESHWAR', 'VALSAD', 'KARIM NAGAR DT', 'KAIRA DT', 'MURADABAD',
'DHANBAD DT', 'SILCHAR', 'NAWADAH', 'BETTIAH', 'BHUBANESWAR',
'TRIVANDRUM DT', 'KINNAUR DT', 'JAUNPUR', 'PUDUCHERRY', 'PARBHANI',
'HAMIRPUR', 'KANYAKUMARI', 'BAHRAICH', 'REWARI', 'SINGHRAULI DT',
'SANGLI', 'NARASARAOPET', 'JHABUA', 'EAST GODAVARI', 'U S A',
'LAKHISARA', 'SAWAI MADHOPUR DT', 'MANAGALORE', 'KERALA', 'CHURU DT',
'DARRANG', 'HALDWANI', 'SURATE DT', 'UNITED KINGDOM', 'PORT BLAIR',
'BARAPANI', 'CHAMPARAN (W)', 'RI-BHOI DT', 'BEHRAMPORE', 'LOHIT DT',
'WASHIM', 'ANGUL', 'RAJNANDGAON DT', 'GONDA', 'TONK DT', 'YANAM',
'DARBHANGA', 'CHITTORGARH DT', 'SAMBA', 'ALAPPUZHA', 'RAIBARELI',
'BALAGHAT DT', 'NAWANSHAHR DT', 'SINGHBHUM (W)', 'KORIYA DT',
'CHITRADURG DT', 'AGRA DT', 'YAVATMAL DT', 'CHITTOOR', 'KATIHAR DT',
'BASTI DT', 'MAU DT', 'BAGALKOT DT', 'VIJAYWADA', 'AMBALA DT',
'NADIAD', 'DIMAPUR', 'SIRMOUR DT', 'TUTICORIN', 'WARDHA', 'KATHUA',
'ITARSI', 'BOKARO', 'MHOW CANTT', 'UDHAMPUR', 'SHAHDOL DT', 'RAICHUR
DT', 'JHANSI DT', 'PALAKAD', 'TIRPATI DT', 'JALANDHAR DT', 'TIRUR',
'BIHAR SHARIF', 'KUMBAKONAM', 'DUVANAGERE', 'DEVGAD', 'DARRANG DT',
'SURATGARH', 'UDHAM SINGH NAGAR', 'PACHMARHI', 'SAHARANPUR DT',
'FIROZPUR CITY', 'GOLAGHAT DT', 'CANNANORE', 'CHICKBALLAPUR', 'GUNTUR
DT', 'DARJEELING DT', 'JALNA DT', 'GONDA DT', 'TARN TARAN', 'ARWAL
DT', 'SIVSAGAR DT', 'LALITPUR', 'KEONJHAR DT', 'DINDIGUL',
'SINGAPORE', 'KASHIPUR DT', 'TUMKUR', 'CANNANORE DT', 'SRIGANGANAGAR
DT', 'DIU', 'KALIMPONG', 'SILVASSA', 'HAVERI DT', 'NAGALAND',
'HOOGHLY', 'DABHADE', 'JHUNJHUNU', 'CHINDWARA DT', 'AMBEDKAR NAGAR
DT', 'AMBEDKAR NAGAR', 'MIDNAPUR', 'SIKKIM (S)', 'CHHATARPUR',
'BILASPUR (HP)', 'KANNAUJ', 'DAMAN', 'BHILWARA', 'KHURDH', 'GUNA DT',
'HONGKONG', 'SHAHIBABAD', 'TRISSUR DT', 'CHITTORGERH', 'C/O56 APO',
'MALAPURAM', 'BHIWANI', 'JHARKHAND', 'SANGAMNER', 'MIDNAPUR DT (W)',
'UDIPI', 'SIRSA', 'MANDI', 'RALEIGH', 'PALGHAT DT', 'UNITED STATES',
'KURUKSHETRA DT', 'FEROZPUR DT', 'PALAMU', 'DAMOH', 'MANDLA', 'WEST
KHASI HILLS DT', 'BAGHPAT', 'BANDA DT', 'SHIKOHABAD', 'RATLAM',
'ORAI', 'KALYAN (W)', 'SIKKIM (E)', 'HANUMANGARH DT', 'MEERUT DT',
'ANKLESHWAR', 'KHANDWA', 'NARSINGHPUR DT', 'SIKKIM (W)', 'RUDRAPRAYAG
DT', 'VAPI (W)', 'JUNAGADH DT', 'REWA DT', 'ASANSOL', 'KOLAR GOLD
FIELDS', '24 PARGANAS (S)', 'UDHAMPUR DT', 'KATNI DT', 'ANANTAPUR DT',
'KURUD', 'BURHANPUR', 'CHAMOLI DT', 'SHUJALPUR MANDI', 'SHEIKHPURA',
'FATEHABAD', 'DHOLPUR', 'CUDDALORE', 'PALAMU DT', 'PULWAMA', 'BADAUN',
'SWEDEN', 'PATNA CITY', 'HINDUPUR', 'QATAR', 'VADAKARA', 'DHULIA DT',
'KASGANJ', 'TIRUVANNAMALAI DT', 'DUMKA DT', 'BILASPUR (MP)', 'UJJAIN
DT', 'BUNDI', 'MUGHALSARAI', 'SIDDIPET', 'SULLURUPETA', 'RAYAGAD',
'SONEPUR DT', 'MACHILIPATNAM', 'KHAMMAM', 'AURANGABAD (B)',
'ADILABAD', 'KAUSAMBI DT', 'RAICHUR', 'DHAR DT', 'KARIM NAGAR',
'TRICIRAPALLI', 'BARDHAWAN', 'CANADA', 'KHAMGAON', 'NALGONDA DT',
'NEYVELI', 'TRIPURA (S) DT', 'WALAJAPET', 'NUAPADA DT', 'NABARANGPUR
DT', 'UDAIPUR DT', 'KASARGOD DT', 'TALCHER', 'RAIGARH (MP)',
'AURANGABAD (BIH)', 'NAGAON', 'MORIGAON', 'HATHRAS', 'PATAN',
'BHAVNAGAR DT', 'KANYAKUMARI DT', 'VILLUPURAM', 'KASARGOD', 'HAMIRPUR
DT', 'KRISHNA', 'PATHANMATHITTA DT', 'KASARAGODE', 'BIDAR DT',
'DAUSA', 'SANGAREDDY', 'NANDURBAR DT', 'LATEHAR', 'UDUMALPET', 'GIRIDH
DT', 'SRIKAKULAM', 'BHADRAK', 'ROPAR', 'HOSUR DT', 'TIRUPATI DT',
'BUXAR DT', 'KARNAL DT', 'SIVAKASI', 'UDAIPUR (TRIPURA)', 'MEDAK',
'PATTUKKOTTAI', 'TIRUTTANI', 'ARCOT', 'N A A DT', 'KOZHENCHERRY',
'KASARGODE', 'PURI', 'SURENDRANAGAR', 'TADEPALLI', 'DAMTAL',
'BALROTRA', 'HAMIRPUR (HP)', 'GIRIDH', 'MAYURBHANJ DT', 'SURGUJA DT',
'JAJPUR DT', 'GHAZIPUR', 'CUDDAPAH DT', 'BARABANKI', 'SAHARSA DT',
'NEEMUCH DT', 'NEEMUCH', 'BALIA DT', 'RAMPUR', 'BHARATPUR',
'UTHUKULI', 'JHANSI CITY', 'UDUPD DT', 'UDUPI PO', 'PUNJAB',
'GUNTAKAL', 'NALANDA', 'TEZPUR', 'MAINPURI', 'BARAUT', 'GANGAVATHI
DT', 'MAHABUB NAGAR', 'BARAN DT', 'KORBA', 'SAMASTIPUR DT', 'RAJGARH
DT', 'RISHRA', 'MIDNAPUR DT (E)', 'CHOMU', 'HAZARIBAGH DT',
'NANDURBAR', 'KODAGUN', 'MIRZAPUR', 'SAGAR DT', 'BETUL', 'YAMUNANAGAR
DT', 'GHAZIPUR DT', 'SHAJAPUR DT', 'VRINDAVAN', 'BEAWAR', 'PALLAKKAD
DT', 'SAHIBGANJ', 'YAVATMAL', 'BHATINDA DT', 'HOSHANGABAD', 'PO/DT
ANGUL', 'FAIZABAD DT', 'MADIKERI', 'BARGARH DT', 'NARSINGHPUR',
'MANCHERIAL', 'MADHUBANI DT', 'LONAVALA', 'SHAHDOL', 'MANDYA',
'SONEBHADRA', 'S KODAGU', 'TEHRI GARHWAL', 'HISSAR DT', 'KORAPUT',
'GAUTAM BUDH NAGAR', 'SHAHJAHANPUR', 'GODAVARI', 'SHAHJAHANPUR DT',
'MAWANA', 'CHARU', 'NOWGONG', 'TRUCHIRAPALLY', 'THEVARA', 'BOLAGARH',
'HANSI', 'ASHOK NAGAR', 'GIDDALUR', 'BAGHPAT DT', 'NANDED DT',
'AMRITSAR DT', 'GODAVARI DT', 'GONDIA DT', 'AYODHYA', 'DUNGARPUR DT',
'JIND DT', 'SATTENAPALLI', 'MAHABUB NAGAR DT', 'GUDIVADA', 'CHATRAI',
'KISHANGANJ', 'MURABABAD', 'PATANCHERU', 'KORBA DT', 'CHAMOLI',
'SAMASTIPUR', 'FIROZPUR CANTT', 'JAKHAL', 'NARWANA', 'PITHORAGARH DT',
'EAST GARO HILLS', 'CANARA', 'U A E', 'DERABASSI', 'JAISALMER',
'NARNAUL', 'KEONJHAR', 'BEHRAMPUR', 'SITAPUR DT', 'FATEHABAD DT',
'LAKHIMPUR (N)', 'BAGESHWAR DT', 'AGRA CANTT', 'TOHANA', 'FAIZABAD',
'MALERKOTLA', 'MAHASAMUND DT', 'SIBSAGAR DT', 'MALDA', 'KAITHAL DT',
'BUDAUN DT', 'HIMACHAL PRADESH', 'TUNDLA', 'BHIWADI', 'KHERI DT',
'KHEDA DT', 'OSMANABAD', 'BASTAR DT', 'MUNNAR', 'SAMALKHA', 'NABHA',
'KALLAKURICHI', 'HARIDWAR DT', 'SIDDHARTHN DT', 'MANNAMPANDAL',
'UNNAO', 'MANSA DT', 'NAGAPATTINAM', 'ANANTHAPUR', 'RAMAMANGALAM',
'MUNNAR DT', 'RANIKHET', 'HARDOI', 'PITHORAGARH', 'MEHBUBNAGAR',
'ANANTAPOR DT', 'JALAUN', 'PILIBHIT DT', 'BUXER DT', 'BARABANKI DT',
'SEONI', 'KOREA DT', 'KHANDWHA DT', 'PERIYAR DT', 'BEHROR', 'BATALA',
'KOKHAJHAR', 'BIJNOR DT', 'BHADOHI', 'KHANDWA DT', 'NAYAGARH DT',
'NAYAGARH', 'MALAPURAM DT', 'NARASINGHPUR', 'SAHAPUR DT', 'ROHTAS DT',
'AURAIYA DT', 'MANDLA DT', 'IDUKKI', 'GERMANY', 'GODDA DT',
'PANITOLA', 'AIZAWAL DT', 'CHITTARANJAN', 'DEWAS DT', 'BUDAUN', 'UTTAR
DINAJPUR DT', 'UTTAR KANNADA', 'GAUTAM BUDH NAGAR DT', 'LAKHISARAI',
'RAIGARD DT', 'SHEGAON', 'RAJNANDGAON', 'THALASSERRY', 'BARGARHA',
'DAHOD DT', 'GURDASPUR', 'DEOLALI', 'AIZAWL', 'SONITPUR', 'SIDDHPUR',
'GARH', 'BELGIUM', 'JAGATSINGH DT', 'N PARUR', 'BONINAYAKANUR',
'PRADEEPPORT', 'FIROZABAD', 'HANGAL', 'BETUL DT', 'CHINGLEPUT',
'CHAMPARAN DT (E)', 'KARAULI', 'U S', 'JHALAWAR DT', 'TIKKAMGARH DT',
'KODERMA DT', 'BARAMULLA DT', 'VIZIANAGRARAM', 'AUROVILLE',
'MURSIDABAD', 'CHANDAULI DT', 'THALAYOLAPARAMBU', 'PRATAPGARH DT
(UP)', 'WAYANAD', 'SULLIA', 'BARPETA DT', 'UJIRE', 'VATAKARA', 'MUNGER
DT', 'SARAN', 'VAISHALI', 'GULBERGA', 'BEGUSARAI DT', 'QUILON DT',
'CHITRADUGRA', 'PARADIP', 'GADAG DT', 'GADAG', 'MURSHIDABAD', 'NORTH
ARCOT', 'BARWANI DT', 'BARNALA', 'FATEHGARHSAHIB DT', 'MOODBIDRI', 'N
COORG', 'MAUNATH BHANJAN', 'PUDDUSSERY', 'KADIRUR', 'FATEHGARH',
'BALANGIR', 'KAMRUP', 'BHATINDA CANTT', 'PURULIYA DT', 'KOHIMA',
'BARPETE', 'MAHABALIPURAM', 'SALCETTE', 'SHIVPURI DT', 'SARAN DT',
'SIRCILLA', 'THIRUVARUR DT', 'RANIBENNUR', 'KOZHIKKODE',
'NETHERLANDS', 'GUMLA', 'MADHYA PRADESH', 'POONCH DT', 'WELLINGTON',
'BAGASARA', 'ITANAGAR', 'THIRUVANANTHAPURAM DT', 'DIPHU', 'KARWAR',
'JAINTIA HILLS DT', 'TAWANG DT', 'KAYAMKULAM P O']
Note: further more cleaning of cities is done in power query ediotor of PowerBI
new.to_excel('final_outlook_cleaned.xlsx', index=False)
The dataset does not include any variable that reliably indicates
subscriber gender, hence gender-based analysis is not feasible i.e. "5.
Prepare gender ratio for each brand." cannot be answered