pca
April 23, 2025
       0.1    Use the Iris dataset and perform PCA on it. Examine the first two
              principal components of X.
       0.2    Create a scatter plot with each of the 150 rows of X projected onto the
              first two principal components. The horizontal axis should be the first
              principal component and the vertical axis the second principal component.
[ ]: import pandas as pd
     df = pd.read_csv("iris.csv")
       # Prepare X and Y
       X = df.values[:,:-1]
       Y = df.values[:,-1]
       df.head()
[ ]:         sepal length   sepal width     petal length    petal width    species
       0              5.1           3.5              1.4            0.2          1
       1              4.9           3.0              1.4            0.2          1
       2              4.7           3.2              1.3            0.2          1
       3              4.6           3.1              1.5            0.2          1
       4              5.0           3.6              1.4            0.2          1
[ ]: df
[ ]:          sepal length    sepal width    petal length    petal width    species
       0               5.1            3.5             1.4            0.2          1
       1               4.9            3.0             1.4            0.2          1
       2               4.7            3.2             1.3            0.2          1
       3               4.6            3.1             1.5            0.2          1
       4               5.0            3.6             1.4            0.2          1
       ..              …            …             …            …      …
       145             6.7            3.0             5.2            2.3             3
       146             6.3            2.5             5.0            1.9             3
       147             6.5            3.0             5.2            2.0             3
       148             6.2            3.4             5.4            2.3             3
       149             5.9            3.0             5.1            1.8             3
                                                     1
     [150 rows x 5 columns]
[ ]: X_standard = X - X.mean()
     X_standard
[ ]: array([[   1.6355,    0.0355,   -2.0645,   -3.2645],
            [   1.4355,   -0.4645,   -2.0645,   -3.2645],
            [   1.2355,   -0.2645,   -2.1645,   -3.2645],
            [   1.1355,   -0.3645,   -1.9645,   -3.2645],
            [   1.5355,    0.1355,   -2.0645,   -3.2645],
            [   1.9355,    0.4355,   -1.7645,   -3.0645],
            [   1.1355,   -0.0645,   -2.0645,   -3.1645],
            [   1.5355,   -0.0645,   -1.9645,   -3.2645],
            [   0.9355,   -0.5645,   -2.0645,   -3.2645],
            [   1.4355,   -0.3645,   -1.9645,   -3.3645],
            [   1.9355,    0.2355,   -1.9645,   -3.2645],
            [   1.3355,   -0.0645,   -1.8645,   -3.2645],
            [   1.3355,   -0.4645,   -2.0645,   -3.3645],
            [   0.8355,   -0.4645,   -2.3645,   -3.3645],
            [   2.3355,    0.5355,   -2.2645,   -3.2645],
            [   2.2355,    0.9355,   -1.9645,   -3.0645],
            [   1.9355,    0.4355,   -2.1645,   -3.0645],
            [   1.6355,    0.0355,   -2.0645,   -3.1645],
            [   2.2355,    0.3355,   -1.7645,   -3.1645],
            [   1.6355,    0.3355,   -1.9645,   -3.1645],
            [   1.9355,   -0.0645,   -1.7645,   -3.2645],
            [   1.6355,    0.2355,   -1.9645,   -3.0645],
            [   1.1355,    0.1355,   -2.4645,   -3.2645],
            [   1.6355,   -0.1645,   -1.7645,   -2.9645],
            [   1.3355,   -0.0645,   -1.5645,   -3.2645],
            [   1.5355,   -0.4645,   -1.8645,   -3.2645],
            [   1.5355,   -0.0645,   -1.8645,   -3.0645],
            [   1.7355,    0.0355,   -1.9645,   -3.2645],
            [   1.7355,   -0.0645,   -2.0645,   -3.2645],
            [   1.2355,   -0.2645,   -1.8645,   -3.2645],
            [   1.3355,   -0.3645,   -1.8645,   -3.2645],
            [   1.9355,   -0.0645,   -1.9645,   -3.0645],
            [   1.7355,    0.6355,   -1.9645,   -3.3645],
            [   2.0355,    0.7355,   -2.0645,   -3.2645],
            [   1.4355,   -0.3645,   -1.9645,   -3.2645],
            [   1.5355,   -0.2645,   -2.2645,   -3.2645],
            [   2.0355,    0.0355,   -2.1645,   -3.2645],
            [   1.4355,    0.1355,   -2.0645,   -3.3645],
            [   0.9355,   -0.4645,   -2.1645,   -3.2645],
            [   1.6355,   -0.0645,   -1.9645,   -3.2645],
            [   1.5355,    0.0355,   -2.1645,   -3.1645],
            [   1.0355,   -1.1645,   -2.1645,   -3.1645],
                                                    2
[   0.9355,   -0.2645,   -2.1645,   -3.2645],
[   1.5355,    0.0355,   -1.8645,   -2.8645],
[   1.6355,    0.3355,   -1.5645,   -3.0645],
[   1.3355,   -0.4645,   -2.0645,   -3.1645],
[   1.6355,    0.3355,   -1.8645,   -3.2645],
[   1.1355,   -0.2645,   -2.0645,   -3.2645],
[   1.8355,    0.2355,   -1.9645,   -3.2645],
[   1.5355,   -0.1645,   -2.0645,   -3.2645],
[   3.5355,   -0.2645,    1.2355,   -2.0645],
[   2.9355,   -0.2645,    1.0355,   -1.9645],
[   3.4355,   -0.3645,    1.4355,   -1.9645],
[   2.0355,   -1.1645,    0.5355,   -2.1645],
[   3.0355,   -0.6645,    1.1355,   -1.9645],
[   2.2355,   -0.6645,    1.0355,   -2.1645],
[   2.8355,   -0.1645,    1.2355,   -1.8645],
[   1.4355,   -1.0645,   -0.1645,   -2.4645],
[   3.1355,   -0.5645,    1.1355,   -2.1645],
[   1.7355,   -0.7645,    0.4355,   -2.0645],
[   1.5355,   -1.4645,    0.0355,   -2.4645],
[   2.4355,   -0.4645,    0.7355,   -1.9645],
[   2.5355,   -1.2645,    0.5355,   -2.4645],
[   2.6355,   -0.5645,    1.2355,   -2.0645],
[   2.1355,   -0.5645,    0.1355,   -2.1645],
[   3.2355,   -0.3645,    0.9355,   -2.0645],
[   2.1355,   -0.4645,    1.0355,   -1.9645],
[   2.3355,   -0.7645,    0.6355,   -2.4645],
[   2.7355,   -1.2645,    1.0355,   -1.9645],
[   2.1355,   -0.9645,    0.4355,   -2.3645],
[   2.4355,   -0.2645,    1.3355,   -1.6645],
[   2.6355,   -0.6645,    0.5355,   -2.1645],
[   2.8355,   -0.9645,    1.4355,   -1.9645],
[   2.6355,   -0.6645,    1.2355,   -2.2645],
[   2.9355,   -0.5645,    0.8355,   -2.1645],
[   3.1355,   -0.4645,    0.9355,   -2.0645],
[   3.3355,   -0.6645,    1.3355,   -2.0645],
[   3.2355,   -0.4645,    1.5355,   -1.7645],
[   2.5355,   -0.5645,    1.0355,   -1.9645],
[   2.2355,   -0.8645,    0.0355,   -2.4645],
[   2.0355,   -1.0645,    0.3355,   -2.3645],
[   2.0355,   -1.0645,    0.2355,   -2.4645],
[   2.3355,   -0.7645,    0.4355,   -2.2645],
[   2.5355,   -0.7645,    1.6355,   -1.8645],
[   1.9355,   -0.4645,    1.0355,   -1.9645],
[   2.5355,   -0.0645,    1.0355,   -1.8645],
[   3.2355,   -0.3645,    1.2355,   -1.9645],
[   2.8355,   -1.1645,    0.9355,   -2.1645],
[   2.1355,   -0.4645,    0.6355,   -2.1645],
                                        3
[   2.0355,   -0.9645, 0.5355, -2.1645],
[   2.0355,   -0.8645, 0.9355, -2.2645],
[   2.6355,   -0.4645, 1.1355, -2.0645],
[   2.3355,   -0.8645, 0.5355, -2.2645],
[   1.5355,   -1.1645, -0.1645, -2.4645],
[   2.1355,   -0.7645, 0.7355, -2.1645],
[   2.2355,   -0.4645, 0.7355, -2.2645],
[   2.2355,   -0.5645, 0.7355, -2.1645],
[   2.7355,   -0.5645, 0.8355, -2.1645],
[   1.6355,   -0.9645, -0.4645, -2.3645],
[   2.2355,   -0.6645, 0.6355, -2.1645],
[   2.8355,   -0.1645, 2.5355, -0.9645],
[   2.3355,   -0.7645, 1.6355, -1.5645],
[   3.6355,   -0.4645, 2.4355, -1.3645],
[   2.8355,   -0.5645, 2.1355, -1.6645],
[   3.0355,   -0.4645, 2.3355, -1.2645],
[   4.1355,   -0.4645, 3.1355, -1.3645],
[   1.4355,   -0.9645, 1.0355, -1.7645],
[   3.8355,   -0.5645, 2.8355, -1.6645],
[   3.2355,   -0.9645, 2.3355, -1.6645],
[   3.7355,    0.1355, 2.6355, -0.9645],
[   3.0355,   -0.2645, 1.6355, -1.4645],
[   2.9355,   -0.7645, 1.8355, -1.5645],
[   3.3355,   -0.4645, 2.0355, -1.3645],
[   2.2355,   -0.9645, 1.5355, -1.4645],
[   2.3355,   -0.6645, 1.6355, -1.0645],
[   2.9355,   -0.2645, 1.8355, -1.1645],
[   3.0355,   -0.4645, 2.0355, -1.6645],
[   4.2355,    0.3355, 3.2355, -1.2645],
[   4.2355,   -0.8645, 3.4355, -1.1645],
[   2.5355,   -1.2645, 1.5355, -1.9645],
[   3.4355,   -0.2645, 2.2355, -1.1645],
[   2.1355,   -0.6645, 1.4355, -1.4645],
[   4.2355,   -0.6645, 3.2355, -1.4645],
[   2.8355,   -0.7645, 1.4355, -1.6645],
[   3.2355,   -0.1645, 2.2355, -1.3645],
[   3.7355,   -0.2645, 2.5355, -1.6645],
[   2.7355,   -0.6645, 1.3355, -1.6645],
[   2.6355,   -0.4645, 1.4355, -1.6645],
[   2.9355,   -0.6645, 2.1355, -1.3645],
[   3.7355,   -0.4645, 2.3355, -1.8645],
[   3.9355,   -0.6645, 2.6355, -1.5645],
[   4.4355,    0.3355, 2.9355, -1.4645],
[   2.9355,   -0.6645, 2.1355, -1.2645],
[   2.8355,   -0.6645, 1.6355, -1.9645],
[   2.6355,   -0.8645, 2.1355, -2.0645],
[   4.2355,   -0.4645, 2.6355, -1.1645],
                                    4
            [   2.8355,   -0.0645,        2.1355,     -1.0645],
            [   2.9355,   -0.3645,        2.0355,     -1.6645],
            [   2.5355,   -0.4645,        1.3355,     -1.6645],
            [   3.4355,   -0.3645,        1.9355,     -1.3645],
            [   3.2355,   -0.3645,        2.1355,     -1.0645],
            [   3.4355,   -0.3645,        1.6355,     -1.1645],
            [   2.3355,   -0.7645,        1.6355,     -1.5645],
            [   3.3355,   -0.2645,        2.4355,     -1.1645],
            [   3.2355,   -0.1645,        2.2355,     -0.9645],
            [   3.2355,   -0.4645,        1.7355,     -1.1645],
            [   2.8355,   -0.9645,        1.5355,     -1.5645],
            [   3.0355,   -0.4645,        1.7355,     -1.4645],
            [   2.7355,   -0.0645,        1.9355,     -1.1645],
            [   2.4355,   -0.4645,        1.6355,     -1.6645]])
[ ]: type(X_standard)
[ ]: numpy.ndarray
[ ]: Y_standard = Y  # labels are not transformed; this only binds a second name to Y
     Y_standard
[ ]: array([1.,   1.,   1.,   1.,   1.,   1.,   1.,   1.,   1.,   1.,   1.,   1.,   1.,   1., 1.,   1.,   1.,
            1.,   1.,   1.,   1.,   1.,   1.,   1.,   1.,   1.,   1.,   1.,   1.,   1.,   1., 1.,   1.,   1.,
            1.,   1.,   1.,   1.,   1.,   1.,   1.,   1.,   1.,   1.,   1.,   1.,   1.,   1., 1.,   1.,   2.,
            2.,   2.,   2.,   2.,   2.,   2.,   2.,   2.,   2.,   2.,   2.,   2.,   2.,   2., 2.,   2.,   2.,
            2.,   2.,   2.,   2.,   2.,   2.,   2.,   2.,   2.,   2.,   2.,   2.,   2.,   2., 2.,   2.,   2.,
            2.,   2.,   2.,   2.,   2.,   2.,   2.,   2.,   2.,   2.,   2.,   2.,   2.,   2., 2.,   3.,   3.,
            3.,   3.,   3.,   3.,   3.,   3.,   3.,   3.,   3.,   3.,   3.,   3.,   3.,   3., 3.,   3.,   3.,
            3.,   3.,   3.,   3.,   3.,   3.,   3.,   3.,   3.,   3.,   3.,   3.,   3.,   3., 3.,   3.,   3.,
            3.,   3.,   3.,   3.,   3.,   3.,   3.,   3.,   3.,   3.,   3.,   3.,   3.,   3.])
[ ]: type(Y_standard)
[ ]: numpy.ndarray
[ ]: cov = np.cov(X.T)   # .T b/c numpy wants varibles along rows rather than down␣
      ↪columns?
     print("covariance matrix =\n",cov)
    covariance matrix =
     [[ 0.68569351 -0.042434    1.27431544 0.51627069]
     [-0.042434    0.18997942 -0.32965638 -0.12163937]
     [ 1.27431544 -0.32965638 3.11627785 1.2956094 ]
     [ 0.51627069 -0.12163937 1.2956094    0.58100626]]
                                                            5
[ ]: from numpy import linalg as LA
     lambdas, vs = LA.eig(cov)
[ ]: lambdas
[ ]: array([4.22824171, 0.24267075, 0.0782095 , 0.02383509])
[ ]: vs
[ ]: array([[ 0.36138659, -0.65658877, -0.58202985, 0.31548719],
            [-0.08452251, -0.73016143, 0.59791083, -0.3197231 ],
            [ 0.85667061, 0.17337266, 0.07623608, -0.47983899],
            [ 0.3582892 , 0.07548102, 0.54583143, 0.75365743]])
[ ]: lambdas
[ ]: array([4.22824171, 0.24267075, 0.0782095 , 0.02383509])
[ ]: #sort the eigenvalues in descending order
     sorted_index = np.argsort(lambdas)[::-1]
[ ]: sorted_index
[ ]: array([0, 1, 2, 3], dtype=int64)
[ ]: sorted_eigenvalue = lambdas[sorted_index]
[ ]: sorted_eigenvalue
[ ]: array([4.22824171, 0.24267075, 0.0782095 , 0.02383509])
[ ]: sorted_eigenvectors = vs[:,sorted_index]
[ ]: sorted_eigenvectors
[ ]: array([[ 0.36138659, -0.65658877, -0.58202985, 0.31548719],
            [-0.08452251, -0.73016143, 0.59791083, -0.3197231 ],
            [ 0.85667061, 0.17337266, 0.07623608, -0.47983899],
            [ 0.3582892 , 0.07548102, 0.54583143, 0.75365743]])
[ ]: # select the first n eigenvectors, n is desired dimension
     # of our final reduced data.
     n_components = 2 #you can select any number of components.
     eigenvector_subset = sorted_eigenvectors[:,0:n_components]
[ ]: eigenvector_subset
                                            6
[ ]: array([[ 0.36138659, -0.65658877],
            [-0.08452251, -0.73016143],
            [ 0.85667061, 0.17337266],
            [ 0.3582892 , 0.07548102]])
[ ]: #Transform the data
     X_reduced = np.dot(eigenvector_subset.transpose(),X_standard.transpose()).
      ↪transpose()
[ ]: print("Information/Variance in PC1",(sorted_eigenvalue[0]/
      ↪(sorted_eigenvalue[0]+sorted_eigenvalue[1]))*100)
    Information/Variance in PC1 94.57223216899484
[ ]: X_reduced
[ ]: array([[   2.81823951,   -5.64634982],
            [   2.78822345,   -5.14995135],
            [   2.61337456,   -5.18200315],
            [   2.75702228,   -5.0086536 ],
            [   2.7736486 ,   -5.65370709],
            [   3.2215055 ,   -6.06828303],
            [   2.68182738,   -5.23749119],
            [   2.87622016,   -5.49033754],
            [   2.6159824 ,   -4.74864082],
            [   2.82960933,   -5.21317833],
            [   2.99541804,   -5.97202148],
            [   2.8896099 ,   -5.34168252],
            [   2.71625587,   -5.09184058],
            [   2.27856139,   -4.81555799],
            [   2.85761474,   -6.50571721],
            [   3.1163261 ,   -6.66501491],
            [   2.87883726,   -6.13763209],
            [   2.85406843,   -5.63880172],
            [   3.30254481,   -6.19979162],
            [   2.91437873,   -5.84051289],
            [   3.19210892,   -5.71829851],
            [   2.9586599 ,   -5.75994864],
            [   2.28642572,   -5.46042065],
            [   3.19963195,   -5.42566143],
            [   3.14661108,   -5.28967072],
            [   2.99569623,   -5.1809357 ],
            [   3.03354506,   -5.45790407],
            [   2.94004523,   -5.69467143],
            [   2.86283042,   -5.63899256],
            [   2.87037575,   -5.12999135],
            [   2.91496666,   -5.12263409],
                                              7
[   3.09243264,   -5.73787684],
[   2.8535028 ,   -6.1403164 ],
[   2.90362838,   -6.42009834],
[   2.86543825,   -5.20563023],
[   2.63612348,   -5.39631705],
[   2.87712708,   -5.9263226 ],
[   2.70168102,   -5.59559631],
[   2.52186309,   -4.83899423],
[   2.91235882,   -5.55599641],
[   2.73226271,   -5.59048011],
[   2.65299643,   -4.385992 ],
[   2.50495859,   -4.98502652],
[   3.09675065,   -5.51582401],
[   3.29287589,   -5.76361572],
[   2.78791371,   -5.07674437],
[   2.96421687,   -5.83072372],
[   2.66290296,   -5.09900701],
[   2.95927938,   -5.9063626 ],
[   2.79900535,   -5.43465866],
[   6.78719082,   -6.01211305],
[   6.43485366,   -5.64528622],
[   6.96666745,   -5.83121539],
[   5.68568285,   -4.49899357],
[   6.59046839,   -5.40154325],
[   6.14403422,   -4.90870571],
[   6.5974258 ,   -5.61042085],
[   4.75324246,   -4.32206162],
[   6.54649696,   -5.55531448],
[   5.49361973,   -4.60387067],
[   4.99452425,   -4.06098139],
[   6.01406369,   -5.22297134],
[   5.76734164,   -4.77691611],
[   6.48729964,   -5.20213472],
[   5.32843976,   -5.07209837],
[   6.43022591,   -5.79413207],
[   6.16264889,   -4.97398291],
[   5.73847013,   -4.99334181],
[   6.44709886,   -4.78380703],
[   5.54759211,   -4.7431182 ],
[   6.61864831,   -5.24233572],
[   5.86025355,   -5.25802755],
[   6.80054901,   -4.99916527],
[   6.42409406,   -5.14421478],
[   6.21721846,   -5.47600852],
[   6.40253951,   -5.65545705],
[   6.83438957,   -5.57139345],
[   7.06016729,   -5.59444802],
                                  8
[   6.31565578,   -5.16360228],
[   5.19678135,   -4.95869039],
[   5.43423864,   -4.62178045],
[   5.31274266,   -4.64666581],
[   5.63879384,   -5.01292014],
[   6.88239157,   -4.90599829],
[   6.09037158,   -4.84266516],
[   6.30922345,   -5.52113489],
[   6.72305602,   -5.73457217],
[   6.31746037,   -4.95491552],
[   5.74832281,   -5.05842818],
[   5.66877835,   -4.64502585],
[   5.96716542,   -4.65624103],
[   6.39318033,   -5.29248813],
[   5.73291316,   -4.92256673],
[   4.79783337,   -4.31470435],
[   5.85934663,   -4.82204248],
[   5.83429961,   -5.11429789],
[   5.87858078,   -5.03373365],
[   6.14494114,   -5.34469077],
[   4.59589527,   -4.57085921],
[   5.80136597,   -4.97805477],
[   8.03355786,   -5.31710347],
[   6.91760101,   -4.75203623],
[   8.11904115,   -5.67085573],
[   7.47389619,   -5.14722467],
[   7.85237105,   -5.28669163],
[   8.89940387,   -5.87778925],
[   6.02359738,   -4.13419385],
[   8.4349522 ,   -5.68245258],
[   7.82359395,   -5.08312107],
[   8.4191161 ,   -6.10974453],
[   7.16413929,   -5.56918098],
[   7.30576709,   -5.11131496],
[   7.66795693,   -5.54322816],
[   6.84852871,   -4.55013423],
[   7.08829336,   -4.78731186],
[   7.40682151,   -5.44620327],
[   7.45205419,   -5.36889584],
[   8.9894205 ,   -6.50269191],
[   9.29801055,   -5.58427555],
[   6.80315685,   -4.56580294],
[   7.93018305,   -5.70514859],
[   6.70136624,   -4.72086105],
[   9.00228517,   -5.78762668],
[   6.89113126,   -5.12255325],
[   7.77779564,   -5.66194318],
                                  9
            [   8.11645561,   -5.88785393],
            [   6.76087329,   -5.14724778],
            [   6.79349719,   -5.21028393],
            [   7.62597386,   -5.1172231 ],
            [   7.89036815,   -5.79159238],
            [   8.34403791,   -5.70222174],
            [   8.73303879,   -6.70111766],
            [   7.66180278,   -5.109675 ],
            [   6.94652637,   -5.18353917],
            [   7.28365994,   -4.8270509 ],
            [   8.57886506,   -6.01503825],
            [   7.64660845,   -5.46701678],
            [   7.40746328,   -5.3762531 ],
            [   6.67169147,   -5.16196231],
            [   7.60997628,   -5.69924045],
            [   7.81651984,   -5.51060386],
            [   7.42463293,   -5.73615604],
            [   6.91760101,   -4.75203623],
            [   8.06537851,   -5.60481518],
            [   7.92111132,   -5.63175077],
            [   7.44647493,   -5.51448488],
            [   7.02953175,   -4.95163559],
            [   7.26671085,   -5.40581143],
            [   7.40330675,   -5.44358054],
            [   6.89255399,   -5.04429164]])
[ ]: import matplotlib.pyplot as plt
     plt.xlabel('Principal Component - 1',fontsize=20)
     plt.ylabel('Principal Component - 2',fontsize=20)
     plt.scatter(X_reduced[:,0:1],X_reduced[:,-1],c=Y)
     plt.show()
                                               10
[ ]:
       11