Scenario Based Interview: PySpark vs Spark SQL
Ganesh. R
#Problem Statement
Product recommendation of the basic kind (“customers who bought this also bought…”) is, in its simplest form, an outcome of basket analysis. In this solution, we will find the products that are most frequently bought together using simple SQL. Based on this purchase history, an e-commerce website can recommend products to a new user.
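The approach in the code below: self-join the orders table on order_id so that every pair of different products bought in the same order becomes one row, remove duplicate pairs, and then count in how many distinct orders each pair appears. The pairs with the highest counts become the “frequently bought together” recommendations.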
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
# Initialize Spark session
spark = SparkSession.builder \
    .appName("OrdersProducts") \
    .getOrCreate()
# Define schema for orders
orders_schema = StructType([
    StructField("order_id", IntegerType(), True),
    StructField("customer_id", IntegerType(), True),
    StructField("product_id", IntegerType(), True)
])
# Define schema for products
products_schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True)
])
# Create data for orders
orders_data = [
    (1, 1, 1),
    (1, 1, 2),
    (1, 1, 3),
    (2, 2, 1),
    (2, 2, 2),
    (2, 2, 4),
    (3, 1, 5)
]
# Create data for products
products_data = [
    (1, 'A'),
    (2, 'B'),
    (3, 'C'),
    (4, 'D'),
    (5, 'E')
]
# Create DataFrame for orders
orders_df = spark.createDataFrame(orders_data, schema=orders_schema)
# Create DataFrame for products
products_df = spark.createDataFrame(products_data, schema=products_schema)
# Show the data (display() works in Databricks notebooks; use .show() in plain PySpark)
orders_df.display()
products_df.display()
# Create temporary views for SQL queries
orders_df.createOrReplaceTempView("orders")
products_df.createOrReplaceTempView("products")
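For reference, in this sample data order 1 contains products A, B and C, order 2 contains A, B and D, and order 3 contains only E. So the A/B pair occurs in two distinct orders and every other pair in exactly one; that is the frequency ranking both solutions below should produce (E never appears in any pair).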
###PySpark
from pyspark.sql.functions import col, concat, monotonically_increasing_id, row_number, countDistinct
from pyspark.sql.window import Window
# Alias the orders DataFrame for joining
a = orders_df.alias("a")
b = orders_df.alias("b")
# Perform the join and necessary transformations
t1 = a.join(b, (a.order_id == b.order_id) & (a.product_id != b.product_id)) \
    .join(products_df.alias("p1"), col("a.product_id") == col("p1.id"), "left") \
    .join(products_df.alias("p2"), col("b.product_id") == col("p2.id"), "left") \
    .select(
        col("a.order_id").alias("order_id"),
        col("a.customer_id").alias("customer_id"),
        col("p1.name").alias("name1"),
        col("p2.name").alias("name2"),
        (col("p1.id") + col("p2.id")).alias("pair_sum"),
        monotonically_increasing_id().alias("idf")
    )
# Define window specification for row_number
window_spec = Window.partitionBy("order_id", "pair_sum").orderBy("idf")
# Apply row_number function to filter duplicates
t2 = t1.withColumn("rnk", row_number().over(window_spec))
# Filter rows to keep only the first occurrence of each pair_sum within each order
t3 = t2.filter(col("rnk") == 1) \
    .withColumn("pair", concat(col("name1"), col("name2")))
# Perform final aggregation
result_df = t3.groupBy("pair") \
    .agg(countDistinct("order_id").alias("frequency")) \
    .orderBy(col("frequency").desc())
# Show the result
result_df.display()
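Note that the pair_sum trick has an edge case: two different pairs in the same order can share the same id sum (for example ids 1 + 4 and 2 + 3), and the rnk = 1 filter would then drop one of them. A common simplification, shown here only as an alternative sketch and not as part of the original solution, is to keep each unordered pair exactly once by joining on a.product_id < b.product_id, which also makes the pair label deterministic:

# Alternative sketch (assumption: same orders_df / products_df as above).
# Joining on a.product_id < b.product_id keeps each unordered pair once,
# so no pair_sum / row_number dedup step is needed.
from pyspark.sql.functions import col, concat_ws, countDistinct

pairs_df = (
    orders_df.alias("a")
    .join(orders_df.alias("b"),
          (col("a.order_id") == col("b.order_id")) &
          (col("a.product_id") < col("b.product_id")))
    .join(products_df.alias("p1"), col("a.product_id") == col("p1.id"), "left")
    .join(products_df.alias("p2"), col("b.product_id") == col("p2.id"), "left")
    .select(col("a.order_id").alias("order_id"),
            concat_ws(" ", col("p1.name"), col("p2.name")).alias("pair"))
    .groupBy("pair")
    .agg(countDistinct("order_id").alias("frequency"))
    .orderBy(col("frequency").desc())
)

pairs_df.display()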
###Spark SQL
%sql
with t1 as (
  Select a.order_id, a.customer_id, p1.name as name1, p2.name as name2,
         (p1.id + p2.id) as pair_sum, monotonically_increasing_id() as idf
  from orders a
  inner join orders b on a.order_id = b.order_id and a.product_id <> b.product_id
  left join products p1 on a.product_id = p1.id
  left join products p2 on b.product_id = p2.id
),
t2 as (
  Select order_id, customer_id, name1, name2, pair_sum,
         row_number() over (partition by order_id, pair_sum order by idf asc) as rnk
  from t1
),
t3 as (
  Select *, concat(name1, ' ', name2) as pair
  from t2
  where rnk = 1
)
Select pair, count(distinct order_id) as frequency
from t3
group by pair
order by 2 desc
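The same simplification can be written in Spark SQL. The sketch below assumes the same orders and products temp views and joins on a.product_id < b.product_id, so each pair is counted once without the pair_sum / row_number step:

%sql
-- Alternative sketch: one row per unordered pair, so no dedup CTEs are needed
Select concat(p1.name, ' ', p2.name) as pair,
       count(distinct a.order_id) as frequency
from orders a
inner join orders b on a.order_id = b.order_id and a.product_id < b.product_id
left join products p1 on a.product_id = p1.id
left join products p2 on b.product_id = p2.id
group by 1
order by 2 desc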
If you found this post useful, please save it.

Ganesh. R | +91-9030485102 | Hyderabad, Telangana | rganesh0203@gmail.com
Medium: https://medium.com/@rganesh0203 | Portfolio: https://rganesh203.github.io/Portfolio/
GitHub: https://github.com/rganesh203 | LinkedIn: https://www.linkedin.com/in/r-ganesh-a86418155/
Instagram: https://www.instagram.com/rg_data_talks/ | Topmate: https://topmate.io/ganesh_r0203