0% found this document useful (0 votes)
4 views14 pages

Solutions 1742312993

Uploaded by

olegruchinsky
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
4 views14 pages

Solutions 1742312993

Uploaded by

olegruchinsky
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 14

🔥🔥🔥SQL SOLUTIONS :-- 🔥🔥🔥

I am hiring a PySpark Data Engineer! See if you can answer the following questions:
--====================
-- Sample employee data (SQL Server global temp table).
CREATE TABLE ##DATA (
    EmpID      INT,
    FName      VARCHAR(50),
    Department VARCHAR(50),
    Salary     INT
);

INSERT INTO ##DATA (EmpID, FName, Department, Salary)
VALUES
    (1, 'Amit',  'IT',      60000),
    (2, 'Priya', 'HR',      55000),
    (3, 'Rahul', 'Finance', 75000),
    (4, 'Sneha', 'IT',      80000),
    (5, 'Karan', 'HR',      65000);
-- 1. Display the first 3 rows.
-- Sort by column name instead of the positional `ORDER BY 1`,
-- which silently changes meaning if the select list is edited.
SELECT TOP 3 *
FROM ##DATA
ORDER BY EmpID;

-- 2. Employees earning more than 70,000.
SELECT *
FROM ##DATA
WHERE Salary > 70000;
-- 3. Average salary per department.
-- The original left the aggregate column unnamed; give it an alias.
SELECT
    Department,
    AVG(Salary) AS AvgSalary
FROM ##DATA
GROUP BY Department;
-- 4. Employees whose name starts with 'A'.
-- LIKE 'A%' is SARGable (can use an index on FName); the original
-- SUBSTRING(FName,1,1)='A' wraps the column in a function and
-- forces a scan. Results are identical.
SELECT *
FROM ##DATA
WHERE FName LIKE 'A%';

-- 5. Number of employees per department.
-- COUNT(FName) counts rows with a non-NULL name, as in the original.
SELECT
    Department,
    COUNT(FName) AS CNT
FROM ##DATA
GROUP BY Department;

-- 6. Derived tax column: 10% of salary (integer arithmetic,
-- matching the original expression exactly).
SELECT
    *,
    (Salary * 10) / 100 AS TAX
FROM ##DATA;

-- 7. Employees by salary descending; FName breaks ties so the
-- order is deterministic.
SELECT *
FROM ##DATA
ORDER BY
    Salary DESC,
    FName;

-- 8. Second-highest salary.
-- DENSE_RANK over distinct salary values returns EVERY employee at
-- the second-highest salary and is deterministic. The original
-- `TOP 1 ... WHERE Salary < MAX` returns an arbitrary single row
-- when several employees share that salary.
SELECT EmpID, FName, Department, Salary
FROM (
    SELECT
        *,
        DENSE_RANK() OVER (ORDER BY Salary DESC) AS rnk
    FROM ##DATA
) AS ranked
WHERE rnk = 2;

-- 9. Employees in the HR or IT department.
SELECT *
FROM ##DATA
WHERE Department IN ('HR', 'IT');

-- 10. Total salary paid by the company.
-- BUG in the original: SUM(Salary) OVER (ORDER BY EmpID) computes
-- a RUNNING total per row (the default RANGE frame), not the
-- company-wide total the task asks for. A plain aggregate is correct.
SELECT SUM(Salary) AS TotalSalary
FROM ##DATA;

-- 11. Cricket players (SQL stand-in for the CSV source file).
CREATE TABLE ##players (
    Player  VARCHAR(50),
    Country VARCHAR(50),
    Runs    INT,
    Wickets INT
);

INSERT INTO ##players (Player, Country, Runs, Wickets)
VALUES
    ('Virat Kohli',    'India',     12000,   4),
    ('Rohit Sharma',   'India',     11000,   8),
    ('Jasprit Bumrah', 'India',      1200, 200),
    ('Steve Smith',    'Australia',  9500,  20);
-- Task: read this CSV file into a DataFrame and display its contents.

-- 12. Player with the maximum runs.
SELECT TOP 1 *
FROM ##players
ORDER BY Runs DESC;

-- Equivalent subquery form (also returns all tied players).
-- BUG in the original alternative: SELECT MAX(PLAYER), MAX(RUNS)
-- evaluates two INDEPENDENT maxima — it pairs the alphabetically
-- last player name with the top run count, not the row holding it.
SELECT Player, Runs
FROM ##players
WHERE Runs = (SELECT MAX(Runs) FROM ##players);

-- 13. Average runs scored by Indian players.
-- BUG in the original first query: AVG(Runs) OVER (ORDER BY Player)
-- is a RUNNING average per row, not the group average. The grouped
-- aggregate below is the correct form.
SELECT
    Country,
    AVG(Runs) AS Runs
FROM ##players
WHERE Country = 'India'
GROUP BY Country;

-- 14. Players who have taken more than 50 wickets.
SELECT *
FROM ##players
WHERE Wickets > 50;


-- 15. Indian city populations (SQL stand-in for the JSON source).
CREATE TABLE ##cities (
    City       VARCHAR(50),
    State      VARCHAR(50),
    Population INT
);

-- Fix: population values are integer literals. The original quoted
-- them as strings ('20000000') and relied on implicit VARCHAR->INT
-- conversion into the INT column.
INSERT INTO ##cities (City, State, Population)
VALUES
    ('Mumbai',    'Maharashtra', 20000000),
    ('Delhi',     'Delhi',       18000000),
    ('Bangalore', 'Karnataka',   12000000),
    ('Hyderabad', 'Telangana',   10000000);

-- Task: display the table contents (DataFrame equivalent below).
SELECT *
FROM ##cities;

-- 16. Cities with a population greater than 15 million.
SELECT *
FROM ##cities
WHERE Population > 15000000;

--17. Calculate Total Population per State

SELECT
STATE,SUM(Population)TOTALPopulation FROM
##cities GROUP BY STATE
-- 18. State with the highest total population.
-- BUG in the original: MAX(STATE) and MAX(Population) are evaluated
-- independently, pairing the alphabetically-last state name with the
-- largest single city population. The correct query aggregates per
-- state, then takes the top total.
SELECT TOP 1
    State,
    SUM(Population) AS TotalPopulation
FROM ##cities
GROUP BY State
ORDER BY TotalPopulation DESC;

--19. Convert a DataFrame to Pandas


3/18/25, 6:43 PM Pyspark1 - Databricks

Pyspark1 (Python)

Import notebook

# Column/aggregate helpers. NOTE: `sum` and `max` shadow the Python
# builtins for the rest of the notebook.
from pyspark.sql.functions import col, avg, count, sum, max

# Sample Data: one tuple per employee (EmpID, Name, Department, Salary).
data = [
(1, "Amit", "IT", 60000),
(2, "Priya", "HR", 55000),
(3, "Rahul", "Finance", 75000),
(4, "Sneha", "IT", 80000),
(5, "Karan", "HR", 65000)
]
# Column names only; types are inferred from the tuples.
# `spark` is the Databricks-provided session (not defined in this cell).
columns = ["EmpID", "Name", "Department", "Salary"]
df = spark.createDataFrame(data, columns)

  df: pyspark.sql.dataframe.DataFrame = [EmpID: long, Name: string ... 2 more fields]

# 1. Show the inferred schema and preview the first 3 rows.
df.printSchema()
preview_rows = 3
df.show(preview_rows)

root
|-- EmpID: long (nullable = true)
|-- Name: string (nullable = true)
|-- Department: string (nullable = true)
|-- Salary: long (nullable = true)

+-----+-----+----------+------+
|EmpID| Name|Department|Salary|
+-----+-----+----------+------+
| 1| Amit| IT| 60000|
| 2|Priya| HR| 55000|
| 3|Rahul| Finance| 75000|
+-----+-----+----------+------+
only showing top 3 rows

https://databricks-prod-cloudfront.cloud.databricks.com/public/4027ec902e239c93eaaa8714f173bcfc/1414938029622029/181382969102664/9730… 1/8
3/18/25, 6:43 PM Pyspark1 - Databricks

# 2. Employees earning more than 70,000.
high_earners = df.where(df["Salary"] > 70000)
high_earners.show()

+-----+-----+----------+------+
|EmpID| Name|Department|Salary|
+-----+-----+----------+------+
| 3|Rahul| Finance| 75000|
| 4|Sneha| IT| 80000|
+-----+-----+----------+------+

# 3. Average salary per department.
avg_salary = avg("Salary").alias("Avg_Salary")
df.groupBy("Department").agg(avg_salary).show()

+----------+----------+
|Department|Avg_Salary|
+----------+----------+
| IT| 70000.0|
| HR| 60000.0|
| Finance| 75000.0|
+----------+----------+

# 4. Employees whose name starts with 'A'.
starts_with_a = df["Name"].startswith("A")
df.where(starts_with_a).show()

+-----+----+----------+------+
|EmpID|Name|Department|Salary|
+-----+----+----------+------+
| 1|Amit| IT| 60000|
+-----+----+----------+------+

# 5. Head-count per department.
headcount = count("EmpID").alias("Num_Employees")
df.groupBy("Department").agg(headcount).show()

+----------+-------------+
|Department|Num_Employees|
+----------+-------------+
| IT| 2|
| HR| 2|
| Finance| 1|
+----------+-------------+

https://databricks-prod-cloudfront.cloud.databricks.com/public/4027ec902e239c93eaaa8714f173bcfc/1414938029622029/181382969102664/9730… 2/8
3/18/25, 6:43 PM Pyspark1 - Databricks

# 6. Add a Tax column equal to 10% of Salary.
# Reassigning df means later cells see the extra column, as in the original.
tax_expr = col("Salary") * 0.1
df = df.withColumn("Tax", tax_expr)
df.show()

  df: pyspark.sql.dataframe.DataFrame = [EmpID: long, Name: string ... 3 more fields]


+-----+-----+----------+------+------+
|EmpID| Name|Department|Salary| Tax|
+-----+-----+----------+------+------+
| 1| Amit| IT| 60000|6000.0|
| 2|Priya| HR| 55000|5500.0|
| 3|Rahul| Finance| 75000|7500.0|
| 4|Sneha| IT| 80000|8000.0|
| 5|Karan| HR| 65000|6500.0|
+-----+-----+----------+------+------+

# 7. Employees sorted by salary, highest first
# (sort() is an alias of orderBy()).
df.sort(df["Salary"].desc()).show()

+-----+-----+----------+------+------+
|EmpID| Name|Department|Salary| Tax|
+-----+-----+----------+------+------+
| 4|Sneha| IT| 80000|8000.0|
| 3|Rahul| Finance| 75000|7500.0|
| 5|Karan| HR| 65000|6500.0|
| 1| Amit| IT| 60000|6000.0|
| 2|Priya| HR| 55000|5500.0|
+-----+-----+----------+------+------+

10

# 8. Second-highest distinct salary and the employee(s) earning it.
distinct_salaries = df.select("Salary").distinct()
top_two = distinct_salaries.sort(col("Salary").desc()).take(2)
# take(2)[-1] mirrors the original limit(2).collect()[-1] behavior,
# including falling back to the max when only one distinct salary exists.
second_highest_salary = top_two[-1][0]
df.where(col("Salary") == second_highest_salary).show()

+-----+-----+----------+------+------+
|EmpID| Name|Department|Salary| Tax|
+-----+-----+----------+------+------+
| 3|Rahul| Finance| 75000|7500.0|
+-----+-----+----------+------+------+

11

# 9. Employees in the HR or IT department
# (isin accepts varargs as well as a list).
in_hr_or_it = col("Department").isin("HR", "IT")
df.where(in_hr_or_it).show()

+-----+-----+----------+------+------+
|EmpID| Name|Department|Salary| Tax|
+-----+-----+----------+------+------+
| 1| Amit| IT| 60000|6000.0|
| 2|Priya| HR| 55000|5500.0|
| 4|Sneha| IT| 80000|8000.0|
| 5|Karan| HR| 65000|6500.0|

https://databricks-prod-cloudfront.cloud.databricks.com/public/4027ec902e239c93eaaa8714f173bcfc/1414938029622029/181382969102664/9730… 3/8
3/18/25, 6:43 PM Pyspark1 - Databricks

+-----+-----+----------+------+------+

12

# 10. Total salary paid by the company (global aggregate).
df.select(sum("Salary").alias("Total_Salary")).show()

+------------+
|Total_Salary|
+------------+
| 335000|
+------------+

13

from pyspark.sql import SparkSession

# Create Spark session. On Databricks a session already exists;
# getOrCreate() returns that same session rather than building a new one.
spark = SparkSession.builder.appName("CricketPlayers").getOrCreate()

# Define data: one tuple per player (Player, Country, Runs, Wickets).
data = [
("Virat Kohli", "India", 12000, 4),
("Rohit Sharma", "India", 11000, 8),
("Jasprit Bumrah", "India", 1200, 200),
("Steve Smith", "Australia", 9500, 20),
("David Warner", "Australia", 10500, 10),
("Ben Stokes", "England", 5000, 150),
("Joe Root", "England", 10000, 30),
]

# Define schema (column names only; types inferred from the tuples).
columns = ["Player", "Country", "Runs", "Wickets"]

# Create DataFrame. NOTE: rebinds `df`, replacing the employee frame
# from the earlier cells.
df = spark.createDataFrame(data, columns)

# Save DataFrame as CSV in Databricks; "overwrite" replaces any
# output from a previous run of this cell.
csv_path = "dbfs:/FileStore/tables/players.csv" # Databricks FileStore path
df.write.mode("overwrite").option("header", True).csv(csv_path)

# Display confirmation message
print(f"CSV file saved at: {csv_path}")

  df: pyspark.sql.dataframe.DataFrame = [Player: string, Country: string ... 2 more fields]


CSV file saved at: dbfs:/FileStore/tables/players.csv

14

# 11. Read the cricket-players CSV back into a DataFrame.
# inferSchema is passed as an option here; equivalent to the kwarg form.
players_df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv("dbfs:/FileStore/tables/players.csv")
)
players_df.show()

  players_df: pyspark.sql.dataframe.DataFrame = [Player: string, Country: string ... 2 more fields]

https://databricks-prod-cloudfront.cloud.databricks.com/public/4027ec902e239c93eaaa8714f173bcfc/1414938029622029/181382969102664/9730… 4/8
3/18/25, 6:43 PM Pyspark1 - Databricks

+--------------+---------+-----+-------+
| Player| Country| Runs|Wickets|
+--------------+---------+-----+-------+
| David Warner|Australia|10500| 10|
|Jasprit Bumrah| India| 1200| 200|
| Steve Smith|Australia| 9500| 20|
| Ben Stokes| England| 5000| 150|
| Rohit Sharma| India|11000| 8|
| Virat Kohli| India|12000| 4|
| Joe Root| England|10000| 30|
+--------------+---------+-----+-------+

15

# 12. Player with the maximum runs.
players_df.sort(players_df["Runs"].desc()).limit(1).show()

+-----------+-------+-----+-------+
| Player|Country| Runs|Wickets|
+-----------+-------+-----+-------+
|Virat Kohli| India|12000| 4|
+-----------+-------+-----+-------+

16

# 13. Average runs scored by Indian players.
indian_players = players_df.where(players_df["Country"] == "India")
indian_players.agg(avg("Runs").alias("Avg_Runs")).show()

+-----------------+
| Avg_Runs|
+-----------------+
|8066.666666666667|
+-----------------+

17

# 14. Players who have taken more than 50 wickets.
players_df.where(players_df["Wickets"] > 50).show()

+--------------+-------+----+-------+
| Player|Country|Runs|Wickets|
+--------------+-------+----+-------+
|Jasprit Bumrah| India|1200| 200|
| Ben Stokes|England|5000| 150|
+--------------+-------+----+-------+

18

https://databricks-prod-cloudfront.cloud.databricks.com/public/4027ec902e239c93eaaa8714f173bcfc/1414938029622029/181382969102664/9730… 5/8
3/18/25, 6:43 PM Pyspark1 - Databricks

from pyspark.sql import SparkSession

# Create Spark session (getOrCreate() reuses the existing Databricks
# session; the appName is ignored when a session already exists).
spark = SparkSession.builder.appName("IndianCities").getOrCreate()

# Define data: one tuple per city (City, State, Population).
data = [
("Mumbai", "Maharashtra", 20000000),
("Delhi", "Delhi", 18000000),
("Bangalore", "Karnataka", 12000000),
("Hyderabad", "Telangana", 10000000),
("Chennai", "Tamil Nadu", 9000000),
("Kolkata", "West Bengal", 15000000),
("Pune", "Maharashtra", 7000000),
("Ahmedabad", "Gujarat", 8000000)
]

# Define schema (column names only; types inferred from the tuples).
columns = ["City", "State", "Population"]

# Create DataFrame. NOTE: rebinds `df` again, replacing the players frame.
df = spark.createDataFrame(data, columns)

# Save DataFrame as JSON in Databricks (line-delimited JSON files;
# "overwrite" replaces output from any previous run).
json_path = "dbfs:/FileStore/tables/cities.json" # Databricks FileStore path
df.write.mode("overwrite").json(json_path)

# Display confirmation message
print(f"JSON file saved at: {json_path}")

  df: pyspark.sql.dataframe.DataFrame = [City: string, State: string ... 1 more field]

JSON file saved at: dbfs:/FileStore/tables/cities.json

19

# 15. Read the cities JSON back into a DataFrame.
# The multiline option is kept exactly as in the original read.
cities_df = (
    spark.read
    .format("json")
    .option("multiline", "true")
    .load("dbfs:/FileStore/tables/cities.json")
)
cities_df.show()

  cities_df: pyspark.sql.dataframe.DataFrame = [City: string, Population: long ... 1 more field]


+---------+----------+-----------+
| City|Population| State|
+---------+----------+-----------+
|Bangalore| 12000000| Karnataka|
|Hyderabad| 10000000| Telangana|
| Kolkata| 15000000|West Bengal|
| Mumbai| 20000000|Maharashtra|
| Chennai| 9000000| Tamil Nadu|
|Ahmedabad| 8000000| Gujarat|
| Pune| 7000000|Maharashtra|
| Delhi| 18000000| Delhi|
+---------+----------+-----------+

20

https://databricks-prod-cloudfront.cloud.databricks.com/public/4027ec902e239c93eaaa8714f173bcfc/1414938029622029/181382969102664/9730… 6/8
3/18/25, 6:43 PM Pyspark1 - Databricks

# 16. Cities with a population greater than 15 million.
cities_df.where(cities_df["Population"] > 15000000).show()

+------+----------+-----------+
| City|Population| State|
+------+----------+-----------+
|Mumbai| 20000000|Maharashtra|
| Delhi| 18000000| Delhi|
+------+----------+-----------+

21

# 17. Total population per state.
total_population = sum("Population").alias("Total_Population")
cities_df.groupBy("State").agg(total_population).show()

+-----------+----------------+
| State|Total_Population|
+-----------+----------------+
| Karnataka| 12000000|
| Telangana| 10000000|
|West Bengal| 15000000|
|Maharashtra| 27000000|
| Tamil Nadu| 9000000|
| Gujarat| 8000000|
| Delhi| 18000000|
+-----------+----------------+

22

# 18. State with the highest total population.
state_totals = cities_df.groupBy("State").agg(
    sum("Population").alias("Total_Population")
)
state_totals.orderBy(col("Total_Population").desc()).limit(1).show()

+-----------+----------------+
| State|Total_Population|
+-----------+----------------+
|Maharashtra| 27000000|
+-----------+----------------+

23

City Population State


0 Bangalore 12000000 Karnataka
1 Hyderabad 10000000 Telangana
2 Kolkata 15000000 West Bengal
3 Mumbai 20000000 Maharashtra
4 Chennai 9000000 Tamil Nadu
5 Ahmedabad 8000000 Gujarat
6 Pune 7000000 Maharashtra
7 Delhi 18000000 Delhi

https://databricks-prod-cloudfront.cloud.databricks.com/public/4027ec902e239c93eaaa8714f173bcfc/1414938029622029/181382969102664/9730… 7/8
3/18/25, 6:43 PM Pyspark1 - Databricks

https://databricks-prod-cloudfront.cloud.databricks.com/public/4027ec902e239c93eaaa8714f173bcfc/1414938029622029/181382969102664/9730… 8/8

You might also like