PySpark Cheatsheet
/tldr: Everything you type 100× per day
2025 Edition
Zero Fluff
Copy-Paste Ready
Session & I/O
from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .appName("demo") \
    .config("spark.sql.adaptive.enabled", "true") \
    .getOrCreate()
# Read
df = spark.read.parquet("s3a://bucket/path/")
df = spark.read.format("delta").load("/path")
df = spark.read.json("path/*.json", multiLine=True)
# Write
df.write.mode("overwrite").partitionBy("date").parquet("s3a://...")
df.write.format("delta").mode("append").save("/table")
# Common options (chain onto spark.read / df.write)
.option("header", "true")
.option("inferSchema", "true")
.option("badRecordsPath", "/_bad/")
.option("mergeSchema", "true")
# Cache
df.cache()
df.persist()  # default StorageLevel: MEMORY_AND_DISK
df.unpersist()
# Show
df.show(5, truncate=False)
df.printSchema()
df.explain(extended=True)
Core Transformations
from pyspark.sql.functions import col, desc
# Columns
df.select("id", "name")
df.select(col("age"), (col("salary")*1.1).alias("new"))
df.withColumn("bonus", col("salary")*0.1)
df.withColumnRenamed("old", "new")
df.drop("temp_col")
# Filtering
df.filter("age > 30")
df.where(col("country").isin("US","UK"))
df.filter(col("date") >= "2024-01-01")
# Sorting
df.orderBy(desc("salary"))
df.sort(col("date").desc(), "name")
# Aggregations
from pyspark.sql.functions import *
df.groupBy("country") \
  .agg(count("*").alias("cnt"),
       sum("salary").alias("total"),
       avg("salary"),
       max("date"))
df.rollup("country","city").agg(sum("salary"))
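Putting it together, a sketch of a typical filter → group → sort chain (reuses the age, country, salary columns from above):
adults_by_country = df.filter(col("age") >= 18) \
    .groupBy("country") \
    .agg(avg("salary").alias("avg_salary"), count("*").alias("cnt")) \
    .orderBy(desc("avg_salary"))
adults_by_country.show(10, truncate=False)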
Joins & Sets
# Joins
df1.join(df2, "id", "inner")
df1.join(df2, df1["id"] == df2["user_id"], "left")
df1.join(broadcast(df2), "id") # hint
# Set operations
df1.union(df2) # unionAll in old versions
df1.unionByName(df2, allowMissingColumns=True)
df1.intersect(df2)
df1.exceptAll(df2)
# Dedupe
df.dropDuplicates(["id","date"])
df.distinct()
# Sample
df.sample(fraction=0.01, seed=42)
df.limit(100)
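When both sides share column names, aliasing keeps references unambiguous. A sketch (name and score are placeholder columns):
a, b = df1.alias("a"), df2.alias("b")
joined = a.join(b, col("a.id") == col("b.user_id"), "left") \
    .select("a.id", "a.name", col("b.score").alias("user_score"))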
Window Functions (Most Used)
from pyspark.sql.window import Window
w = Window.partitionBy("country").orderBy(desc("salary"))
w_rows = Window.partitionBy("country").orderBy("date").rowsBetween(-2, 0)
df.withColumn("rank", rank().over(w))
.withColumn("dense_rank", dense_rank().over(w))
.withColumn("row_num", row_number().over(w))
.withColumn("lag", lag("salary", 1).over(w))
.withColumn("lead", lead("salary", 1).over(w))
.withColumn("running_sum", sum("salary").over(w.rowsBetween(Window.unboundedPreceding, 0)))
.withColumn("mov_avg", avg("salary").over(w_rows))
String & Date Magic
# Strings
col("name").substr(1, 3)
regexp_replace(col("phone"), "[^0-9]", "")
split(col("tags"), ",")
array_join(col("tags_array"), "|")
translate(col("text"), "aeiou", "12345")
# Dates
to_date(col("ts"))
to_timestamp(col("ts"), "yyyy-MM-dd HH:mm")
date_add(col("date"), 7)
datediff(current_date(), col("birth"))
date_format(col("ts"), "yyyy-MM")
# Conditional
when(col("age") < 18, "minor") \
.when(col("age") < 65, "adult") \
.otherwise("senior")
coalesce(col("new"), col("old"), lit("unknown"))
# Nulls
df.dropna(subset=["id"])
df.fillna({"age": 0, "name": "unknown"})
df.na.drop("any") # vs "all"
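These expressions attach like any other column. A sketch assuming the age and ts columns used above:
df = df.withColumn("age_group",
                   when(col("age") < 18, "minor")
                   .when(col("age") < 65, "adult")
                   .otherwise("senior"))
df = df.withColumn("month", date_format(col("ts"), "yyyy-MM"))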
UDFs (Use Sparingly)
# Regular UDF (slow: runs row by row in Python)
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

@udf(returnType=StringType())
def clean(text):
    return text.strip().lower() if text else None
# Pandas UDF (vectorized – 10–100× faster)
from pyspark.sql.functions import pandas_udf
import pandas as pd
@pandas_udf(StringType())
def clean_vec(texts: pd.Series) -> pd.Series:
    return texts.str.strip().str.lower()
df.withColumn("clean", clean_vec(col("text")))
Debug & Collect Safely
# Safe collect
df.limit(10).collect()
df.take(5)
df.head(3)
# To pandas (only small data!)
df.limit(1000).toPandas()
# Debug
df.explain(True) # full plan
spark.conf.get("spark.sql.adaptive.enabled")
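Two quick checks that pair well with explain() (the values here are illustrative, not recommendations):
df.rdd.getNumPartitions()  # partitions feeding the next shuffle/write
spark.conf.set("spark.sql.shuffle.partitions", "200")  # tune before wide joins/aggs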
You now speak fluent PySpark.
Copy → Paste → Ship