
Spark

The document shows how to perform data exploration and linear regression on multiple datasets using Spark SQL and MLlib. It loads climate and disaster datasets, cleans and joins the data, builds a linear regression model to predict natural disasters from other variables, and evaluates the model's performance.


%spark

val sea_level_df = spark.sql("SELECT * FROM csiro_global_mean_sea_level_2013")

val fossil_fuel_df = spark.sql("SELECT * FROM global_carbon_emission_2018")

val temperature_df = spark.sql("SELECT * FROM global_temperature_2019")

val disasters_df = spark.sql("SELECT * FROM global_natural_disaster_events_2018")
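
Before filtering, it can help to confirm each table's column names and types, since the selects below rely on exact names. A minimal sketch using the DataFrames just loaded:

%spark

// Inspect schemas to verify column names before filtering and joining
sea_level_df.printSchema()
fossil_fuel_df.printSchema()
temperature_df.printSchema()
disasters_df.printSchema()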

%spark

// Keep only the aggregate 'All natural disasters' rows

val natural_disasters_df = disasters_df.filter("Entity = 'All natural disasters'")

%spark

// Restrict to the year range covered by all four datasets

val nat_disasters_df = natural_disasters_df.filter("Year >= 1959 AND Year <= 2013")

%spark

// Show the result

nat_disasters_df.show()

%spark

val fossil_fuels_df = fossil_fuel_df.filter("Year >= 1959 AND Year <= 2013")

%spark

fossil_fuels_df.show()

%spark

val global_sea_level_df = sea_level_df.filter("Year >= 1959 AND Year <= 2013")

%spark

global_sea_level_df.show()
%spark

val global_temperature_df = temperature_df.filter("Year >= 1959 AND Year <= 2013")

%spark

global_temperature_df.show()

%spark

val selected_disasters_df = nat_disasters_df.select("year", "total_natural_disasters")

selected_disasters_df.show()

%spark

val selected_fossil_fuels_df = fossil_fuels_df.select("year", "carbon_emissions")

selected_fossil_fuels_df.show()

%spark

val selected_sea_level_df = global_sea_level_df.select("year", "mean_sea_level", "month")

selected_sea_level_df.show()

%spark

val selected_temperature_df = global_temperature_df.select("year", "global_temperature", "month")

selected_temperature_df.show()

%spark

val temp_df = selected_temperature_df.filter("month = 12")

val global_temp_df = temp_df.select("year", "global_temperature")

// Show the result

global_temp_df.show()
%spark

val sealevel_df = selected_sea_level_df.filter("month = 'Dec'")

val global_mean_sea_level_df = sealevel_df.select("year", "mean_sea_level")

// Show the result

global_mean_sea_level_df.show()

%spark

val table_df = global_temp_df

.join(selected_fossil_fuels_df, Seq("year"))

.join(global_mean_sea_level_df, Seq("year"))

.join(selected_disasters_df, Seq("year"))

// Show the result

table_df.show()
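
Inner joins on year silently drop any year missing from one of the inputs, so a quick row-count comparison is a useful sanity check. A minimal sketch using the DataFrames defined above:

%spark

// Compare row counts before and after the join; a large drop means
// some years are missing from one of the inputs
println(s"temperature rows: ${global_temp_df.count()}")
println(s"joined rows: ${table_df.count()}")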

%spark

table_df.createOrReplaceTempView("climate_table")

%sql

SHOW TABLES

%sql

SELECT
  year,
  global_temperature,
  carbon_emissions,
  LOG(mean_sea_level) AS log_mean_sea_level,
  LOG(total_natural_disasters) AS log_total_natural_disasters
FROM climate_table WHERE year BETWEEN 1990 AND 2013


%spark

// Re-register the joined DataFrame under a second view name for the plot query

table_df.createOrReplaceTempView("my_table")

val plotData = spark.sql("""

SELECT carbon_emissions, total_natural_disasters

FROM my_table

""")

// Register the DataFrame as a temporary table for visualization

plotData.createOrReplaceTempView("plot_data")

%sql

SELECT * FROM plot_data
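
Before fitting a model, it is worth checking how strongly the two plotted columns co-vary. A minimal sketch using DataFrame.stat.corr (Pearson correlation by default) on the plotData DataFrame above:

%spark

// Pearson correlation between emissions and disaster counts
val corr = plotData.stat.corr("carbon_emissions", "total_natural_disasters")
println(s"Pearson correlation: $corr")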

%sql

SELECT

year,

carbon_emissions

FROM

climate_table WHERE year BETWEEN 1990 AND 2013

%spark

// Import required Spark libraries

import org.apache.spark.ml.regression.LinearRegression

import org.apache.spark.ml.feature.VectorAssembler

import org.apache.spark.ml.Pipeline

import org.apache.spark.ml.evaluation.RegressionEvaluator
// Drop rows with missing values before modelling

val cleanedData = table_df.na.drop()

// Assemble features into a single vector column

val assembler = new VectorAssembler()

.setInputCols(Array("year", "global_temperature", "carbon_emissions", "mean_sea_level"))

.setOutputCol("features")

// Linear Regression model

val lr = new LinearRegression()

.setLabelCol("total_natural_disasters")

.setFeaturesCol("features")

// Create a pipeline

val pipeline = new Pipeline().setStages(Array(assembler, lr))

// Split the data into training and testing sets (fixed seed for reproducibility)

val Array(trainingData, testData) = cleanedData.randomSplit(Array(0.8, 0.2), seed = 42)

// Fit the model to the training data

val model = pipeline.fit(trainingData)

// Make predictions on the test data

val predictions = model.transform(testData)

// Evaluate the model

val evaluator = new RegressionEvaluator()

.setLabelCol("total_natural_disasters")

.setPredictionCol("prediction")

.setMetricName("rmse")
val rmse = evaluator.evaluate(predictions)

println(s"Root Mean Squared Error (RMSE) on test data: $rmse")

// Show the predictions

predictions.select("year", "total_natural_disasters", "prediction").show()
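
RMSE alone gives limited insight; inspecting the fitted coefficients and the R2 score rounds out the evaluation. A sketch, assuming the pipeline and evaluator from the block above (the pipeline's last stage is the fitted LinearRegressionModel):

%spark

import org.apache.spark.ml.regression.LinearRegressionModel

// Extract the fitted linear model from the pipeline and inspect it
val lrModel = model.stages.last.asInstanceOf[LinearRegressionModel]
println(s"Coefficients: ${lrModel.coefficients}")
println(s"Intercept: ${lrModel.intercept}")

// R-squared on the same test predictions (reuses the evaluator above)
val r2 = evaluator.setMetricName("r2").evaluate(predictions)
println(s"R2 on test data: $r2")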

%spark

// Create a temporary view for predictions

predictions.createOrReplaceTempView("predictions_table")

// Query the predictions data

val predictionsTableDF = spark.sql("SELECT year, total_natural_disasters, prediction FROM predictions_table")

%sql

SELECT

year,

total_natural_disasters,

prediction

FROM predictions_table
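
To see where the model over- or under-predicts, a residual column can be appended to the predictions. A minimal sketch, assuming the predictions DataFrame above:

%spark

import org.apache.spark.sql.functions.col

// Residual = actual - predicted; large magnitudes flag poorly fitted years
val residualsDF = predictions
  .withColumn("residual", col("total_natural_disasters") - col("prediction"))
  .select("year", "total_natural_disasters", "prediction", "residual")

residualsDF.show()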
