🔥🔥🔥 SQL SOLUTIONS 🔥🔥🔥
I am hiring a PySpark Data Engineer! Can you answer the following questions?
--====================
CREATE TABLE ##DATA(EmpID INT, FName VARCHAR(50), Department VARCHAR(50), Salary INT)
INSERT INTO ##DATA VALUES
(1, 'Amit', 'IT', 60000),
(2, 'Priya', 'HR', 55000),
(3, 'Rahul', 'Finance', 75000),
(4, 'Sneha', 'IT', 80000),
(5, 'Karan', 'HR', 65000)
--1. Task: Display the schema and first 3 rows.
SELECT TOP 3 * FROM ##DATA ORDER BY 1
--2. Filter Employees Earning More than 70,000
SELECT * FROM ##DATA WHERE Salary > 70000
--3. Calculate Average Salary per Department
SELECT Department, AVG(Salary) AS AvgSalary FROM ##DATA GROUP BY Department
--4. Find Employees whose Name Starts with 'A'
SELECT * FROM ##DATA WHERE FName LIKE 'A%'
--5. Count the Number of Employees per Department
SELECT Department, COUNT(*) AS Cnt FROM ##DATA GROUP BY Department
--6. Add a New Column for Tax Deduction (10% of Salary)
SELECT *, (Salary * 10) / 100 AS Tax FROM ##DATA
--7. Sort Employees by Salary in Descending Order
SELECT * FROM ##DATA ORDER BY Salary DESC, FName
--8. Get the Second Highest Salary
SELECT TOP 1 * FROM ##DATA
WHERE Salary < (SELECT MAX(Salary) FROM ##DATA)
ORDER BY Salary DESC
--9. Get Employees Who are in the HR or IT Department
SELECT * FROM ##DATA WHERE Department IN ('HR', 'IT')
--10. Find the Total Salary Paid by the Company
-- An empty OVER() spans all rows; ORDER BY inside the window would turn this into a running total
SELECT *, SUM(Salary) OVER() AS TotalSalary FROM ##DATA
--11. Read a CSV File of Cricket Players
CREATE TABLE ##players(Player VARCHAR(50), Country VARCHAR(50), Runs INT, Wickets INT)
INSERT INTO ##players VALUES
('Virat Kohli', 'India', 12000, 4),
('Rohit Sharma', 'India', 11000, 8),
('Jasprit Bumrah', 'India', 1200, 200),
('Steve Smith', 'Australia', 9500, 20)
--Task: Read this CSV file into a DataFrame and display its contents (simulated here with a table).
SELECT * FROM ##players
--12. Find the Player with Maximum Runs
SELECT TOP 1 * FROM ##players ORDER BY Runs DESC
-- Alternative: match against the maximum run count
SELECT Player, Runs FROM ##players WHERE Runs = (SELECT MAX(Runs) FROM ##players)
--13. Find the Average Runs Scored by Indian Players
SELECT AVG(Runs) AS AvgRuns FROM ##players WHERE Country = 'India'
-- Grouped variant
SELECT Country, AVG(Runs) AS AvgRuns FROM ##players WHERE Country = 'India' GROUP BY Country
--14. Get Players Who Have Taken More than 50 Wickets
SELECT * FROM ##players WHERE Wickets > 50
--15. Read a JSON File Containing Indian Cities Population
CREATE TABLE ##cities(City VARCHAR(50), State VARCHAR(50), Population INT)
INSERT INTO ##cities VALUES
('Mumbai', 'Maharashtra', 20000000),
('Delhi', 'Delhi', 18000000),
('Bangalore', 'Karnataka', 12000000),
('Hyderabad', 'Telangana', 10000000)
--Task: Read this JSON file into a DataFrame and display its contents (simulated here with a table).
SELECT * FROM ##cities
--16. Find Cities with a Population Greater than 15 Million
SELECT * FROM ##cities WHERE Population > 15000000
--17. Calculate Total Population per State
SELECT State, SUM(Population) AS TotalPopulation FROM ##cities GROUP BY State
--18. Find the State with the Highest Total Population
SELECT TOP 1 State, SUM(Population) AS TotalPopulation
FROM ##cities GROUP BY State
ORDER BY TotalPopulation DESC
--19. Convert a DataFrame to Pandas (PySpark-only; see the notebook section below)
Pyspark1 (Python)
from pyspark.sql.functions import col, avg, count, sum, max
# Sample Data
data = [
(1, "Amit", "IT", 60000),
(2, "Priya", "HR", 55000),
(3, "Rahul", "Finance", 75000),
(4, "Sneha", "IT", 80000),
(5, "Karan", "HR", 65000)
]
columns = ["EmpID", "Name", "Department", "Salary"]
df = spark.createDataFrame(data, columns)
df: pyspark.sql.dataframe.DataFrame = [EmpID: long, Name: string ... 2 more fields]
# 1. Display Schema and First 3 Rows
df.printSchema()
df.show(3)
root
|-- EmpID: long (nullable = true)
|-- Name: string (nullable = true)
|-- Department: string (nullable = true)
|-- Salary: long (nullable = true)
+-----+-----+----------+------+
|EmpID| Name|Department|Salary|
+-----+-----+----------+------+
| 1| Amit| IT| 60000|
| 2|Priya| HR| 55000|
| 3|Rahul| Finance| 75000|
+-----+-----+----------+------+
only showing top 3 rows
# 2. Filter Employees Earning More than 70,000
df.filter(col("Salary") > 70000).show()
+-----+-----+----------+------+
|EmpID| Name|Department|Salary|
+-----+-----+----------+------+
| 3|Rahul| Finance| 75000|
| 4|Sneha| IT| 80000|
+-----+-----+----------+------+
# 3. Calculate Average Salary per Department
df.groupBy("Department").agg(avg("Salary").alias("Avg_Salary")).show()
+----------+----------+
|Department|Avg_Salary|
+----------+----------+
| IT| 70000.0|
| HR| 60000.0|
| Finance| 75000.0|
+----------+----------+
# 4. Find Employees whose Name Starts with 'A'
df.filter(col("Name").startswith("A")).show()
+-----+----+----------+------+
|EmpID|Name|Department|Salary|
+-----+----+----------+------+
| 1|Amit| IT| 60000|
+-----+----+----------+------+
# 5. Count the Number of Employees per Department
df.groupBy("Department").agg(count("EmpID").alias("Num_Employees")).show()
+----------+-------------+
|Department|Num_Employees|
+----------+-------------+
| IT| 2|
| HR| 2|
| Finance| 1|
+----------+-------------+
# 6. Add a New Column for Tax Deduction (10% of Salary)
df = df.withColumn("Tax", col("Salary") * 0.1)
df.show()
df: pyspark.sql.dataframe.DataFrame = [EmpID: long, Name: string ... 3 more fields]
+-----+-----+----------+------+------+
|EmpID| Name|Department|Salary| Tax|
+-----+-----+----------+------+------+
| 1| Amit| IT| 60000|6000.0|
| 2|Priya| HR| 55000|5500.0|
| 3|Rahul| Finance| 75000|7500.0|
| 4|Sneha| IT| 80000|8000.0|
| 5|Karan| HR| 65000|6500.0|
+-----+-----+----------+------+------+
# 7. Sort Employees by Salary in Descending Order
df.orderBy(col("Salary").desc()).show()
+-----+-----+----------+------+------+
|EmpID| Name|Department|Salary| Tax|
+-----+-----+----------+------+------+
| 4|Sneha| IT| 80000|8000.0|
| 3|Rahul| Finance| 75000|7500.0|
| 5|Karan| HR| 65000|6500.0|
| 1| Amit| IT| 60000|6000.0|
| 2|Priya| HR| 55000|5500.0|
+-----+-----+----------+------+------+
# 8. Get the Second Highest Salary
second_highest_salary = df.select("Salary").distinct().orderBy(col("Salary").desc()).limit(2).collect()[-1][0]
df.filter(col("Salary") == second_highest_salary).show()
+-----+-----+----------+------+------+
|EmpID| Name|Department|Salary| Tax|
+-----+-----+----------+------+------+
| 3|Rahul| Finance| 75000|7500.0|
+-----+-----+----------+------+------+
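An alternative sketch that avoids collect(): rank salaries with a window function (dense_rank also handles salary ties; df and col come from the cells above):
from pyspark.sql.window import Window
from pyspark.sql.functions import dense_rank

# A window with no partitionBy pulls all rows into one partition,
# which is acceptable for a toy dataset like this one
w = Window.orderBy(col("Salary").desc())
df.withColumn("rnk", dense_rank().over(w)).filter(col("rnk") == 2).drop("rnk").show()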
# 9. Get Employees Who are in the HR or IT Department
df.filter(col("Department").isin(["HR", "IT"])).show()
+-----+-----+----------+------+------+
|EmpID| Name|Department|Salary| Tax|
+-----+-----+----------+------+------+
| 1| Amit| IT| 60000|6000.0|
| 2|Priya| HR| 55000|5500.0|
| 4|Sneha| IT| 80000|8000.0|
| 5|Karan| HR| 65000|6500.0|
+-----+-----+----------+------+------+
# 10. Find the Total Salary Paid by the Company
df.agg(sum("Salary").alias("Total_Salary")).show()
+------------+
|Total_Salary|
+------------+
| 335000|
+------------+
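A window variant sketch that keeps every employee row and appends the company-wide total next to it, mirroring the SUM(Salary) OVER() form in the SQL section:
from pyspark.sql.window import Window

# partitionBy() with no columns makes the window span the whole DataFrame
df.withColumn("Total_Salary", sum("Salary").over(Window.partitionBy())).show()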
from pyspark.sql import SparkSession
# Create Spark session
spark = SparkSession.builder.appName("CricketPlayers").getOrCreate()
# Define data
data = [
("Virat Kohli", "India", 12000, 4),
("Rohit Sharma", "India", 11000, 8),
("Jasprit Bumrah", "India", 1200, 200),
("Steve Smith", "Australia", 9500, 20),
("David Warner", "Australia", 10500, 10),
("Ben Stokes", "England", 5000, 150),
("Joe Root", "England", 10000, 30),
]
# Define schema
columns = ["Player", "Country", "Runs", "Wickets"]
# Create DataFrame
df = spark.createDataFrame(data, columns)
# Save DataFrame as CSV in Databricks
csv_path = "dbfs:/FileStore/tables/players.csv" # Databricks FileStore path
df.write.mode("overwrite").option("header", True).csv(csv_path)
# Display confirmation message
print(f"CSV file saved at: {csv_path}")
df: pyspark.sql.dataframe.DataFrame = [Player: string, Country: string ... 2 more fields]
CSV file saved at: dbfs:/FileStore/tables/players.csv
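Worth noting: df.write.csv() creates a directory of part files at csv_path, not a single players.csv. A sketch that forces one output file (only sensible for small data, since coalesce(1) routes every row through a single task):
df.coalesce(1).write.mode("overwrite").option("header", True).csv(csv_path)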
# 11. Read a CSV File of Cricket Players
players_df = spark.read.option("header", "true").csv("dbfs:/FileStore/tables/players.csv", inferSchema=True)
players_df.show()
players_df: pyspark.sql.dataframe.DataFrame = [Player: string, Country: string ... 2 more fields]
+--------------+---------+-----+-------+
| Player| Country| Runs|Wickets|
+--------------+---------+-----+-------+
| David Warner|Australia|10500| 10|
|Jasprit Bumrah| India| 1200| 200|
| Steve Smith|Australia| 9500| 20|
| Ben Stokes| England| 5000| 150|
| Rohit Sharma| India|11000| 8|
| Virat Kohli| India|12000| 4|
| Joe Root| England|10000| 30|
+--------------+---------+-----+-------+
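An alternative sketch that supplies an explicit schema instead of inferSchema, skipping the extra pass over the file that schema inference costs:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

players_schema = StructType([
    StructField("Player", StringType()),
    StructField("Country", StringType()),
    StructField("Runs", IntegerType()),
    StructField("Wickets", IntegerType()),
])
players_df = spark.read.option("header", "true").schema(players_schema).csv("dbfs:/FileStore/tables/players.csv")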
# 12. Find the Player with Maximum Runs
players_df.orderBy(col("Runs").desc()).limit(1).show()
+-----------+-------+-----+-------+
| Player|Country| Runs|Wickets|
+-----------+-------+-----+-------+
|Virat Kohli| India|12000| 4|
+-----------+-------+-----+-------+
# 13. Find the Average Runs Scored by Indian Players
players_df.filter(col("Country") == "India").agg(avg("Runs").alias("Avg_Runs")).show()
+-----------------+
| Avg_Runs|
+-----------------+
|8066.666666666667|
+-----------------+
# 14. Get Players Who Have Taken More than 50 Wickets
players_df.filter(col("Wickets") > 50).show()
+--------------+-------+----+-------+
| Player|Country|Runs|Wickets|
+--------------+-------+----+-------+
|Jasprit Bumrah| India|1200| 200|
| Ben Stokes|England|5000| 150|
+--------------+-------+----+-------+
from pyspark.sql import SparkSession
# Create Spark session
spark = SparkSession.builder.appName("IndianCities").getOrCreate()
# Define data
data = [
("Mumbai", "Maharashtra", 20000000),
("Delhi", "Delhi", 18000000),
("Bangalore", "Karnataka", 12000000),
("Hyderabad", "Telangana", 10000000),
("Chennai", "Tamil Nadu", 9000000),
("Kolkata", "West Bengal", 15000000),
("Pune", "Maharashtra", 7000000),
("Ahmedabad", "Gujarat", 8000000)
]
# Define schema
columns = ["City", "State", "Population"]
# Create DataFrame
df = spark.createDataFrame(data, columns)
# Save DataFrame as JSON in Databricks
json_path = "dbfs:/FileStore/tables/cities.json" # Databricks FileStore path
df.write.mode("overwrite").json(json_path)
# Display confirmation message
print(f"JSON file saved at: {json_path}")
df: pyspark.sql.dataframe.DataFrame = [City: string, State: string ... 1 more field]
JSON file saved at: dbfs:/FileStore/tables/cities.json
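Since df.write.json() emits newline-delimited JSON (one object per line), it reads back with a plain spark.read.json(). The multiline option only applies when a file holds a single pretty-printed JSON document, such as an array of objects. A commented sketch, assuming a hypothetical file cities_pretty.json in that format:
# cities_df = spark.read.option("multiline", "true").json("dbfs:/FileStore/tables/cities_pretty.json")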
# 15. Read a JSON File Containing Indian Cities Population
# Spark's own JSON output is newline-delimited, so no multiline option is needed here
cities_df = spark.read.json("dbfs:/FileStore/tables/cities.json")
cities_df.show()
cities_df: pyspark.sql.dataframe.DataFrame = [City: string, Population: long ... 1 more field]
+---------+----------+-----------+
| City|Population| State|
+---------+----------+-----------+
|Bangalore| 12000000| Karnataka|
|Hyderabad| 10000000| Telangana|
| Kolkata| 15000000|West Bengal|
| Mumbai| 20000000|Maharashtra|
| Chennai| 9000000| Tamil Nadu|
|Ahmedabad| 8000000| Gujarat|
| Pune| 7000000|Maharashtra|
| Delhi| 18000000| Delhi|
+---------+----------+-----------+
# 16. Find Cities with a Population Greater than 15 Million
cities_df.filter(col("Population") > 15000000).show()
+------+----------+-----------+
| City|Population| State|
+------+----------+-----------+
|Mumbai| 20000000|Maharashtra|
| Delhi| 18000000| Delhi|
+------+----------+-----------+
# 17. Calculate Total Population per State
cities_df.groupBy("State").agg(sum("Population").alias("Total_Population")).show()
+-----------+----------------+
| State|Total_Population|
+-----------+----------------+
| Karnataka| 12000000|
| Telangana| 10000000|
|West Bengal| 15000000|
|Maharashtra| 27000000|
| Tamil Nadu| 9000000|
| Gujarat| 8000000|
| Delhi| 18000000|
+-----------+----------------+
# 18. Find the State with the Highest Total Population
cities_df.groupBy("State").agg(sum("Population").alias("Total_Population")).orderBy(col("Total_Population").desc()).limit(1).show()
+-----------+----------------+
| State|Total_Population|
+-----------+----------------+
|Maharashtra| 27000000|
+-----------+----------------+
# 19. Convert a DataFrame to Pandas
pandas_df = cities_df.toPandas()
print(pandas_df)
City Population State
0 Bangalore 12000000 Karnataka
1 Hyderabad 10000000 Telangana
2 Kolkata 15000000 West Bengal
3 Mumbai 20000000 Maharashtra
4 Chennai 9000000 Tamil Nadu
5 Ahmedabad 8000000 Gujarat
6 Pune 7000000 Maharashtra
7 Delhi 18000000 Delhi
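A note on the conversion above: toPandas() pulls the entire DataFrame onto the driver, so it only suits small results. A sketch that enables Arrow to speed up the transfer (the config key below assumes Spark 3.x):
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
pandas_df = cities_df.toPandas()  # same conversion, now Arrow-accelerated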