
PySpark Cheatsheet

/tldr: Everything you type 100× per day

2025 Edition • Zero Fluff • Copy-Paste Ready

Session & I/O

from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .appName("demo") \
    .config("spark.sql.adaptive.enabled", "true") \
    .getOrCreate()

# Read
df = spark.read.parquet("s3a://bucket/path/")
df = spark.read.format("delta").load("/path")
df = spark.read.json("path/*.json", multiLine=True)

# Write
df.write.mode("overwrite").partitionBy("date").parquet("s3a://...")
df.write.format("delta").mode("append").save("/table")

# Common options (chain onto spark.read / df.write)
.option("header", "true")             # CSV
.option("inferSchema", "true")        # CSV
.option("badRecordsPath", "/_bad/")   # Databricks runtimes
.option("mergeSchema", "true")        # Parquet / Delta

# Cache
df.cache()
df.persist()                    # default: MEMORY_AND_DISK (deserialized)
df.unpersist()
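
To pin an explicit storage level instead of the default (MEMORY_ONLY here is just an example):

from pyspark import StorageLevel
df.persist(StorageLevel.MEMORY_ONLY)   # keep in memory only, recompute on eviction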

# Show
df.show(5, truncate=False)
df.printSchema()
df.explain(extended=True)

Core Transformations

from pyspark.sql.functions import col, desc

# Columns
df.select("id", "name")
df.select(col("age"), (col("salary")*1.1).alias("new"))
df.withColumn("bonus", col("salary")*0.1)
df.withColumnRenamed("old", "new")
df.drop("temp_col")

# Filtering
df.filter("age > 30")
df.where(col("country").isin("US","UK"))
df.filter(col("date") >= "2024-01-01")

# Sorting
df.orderBy(desc("salary"))
df.sort(col("date").desc(), "name")

# Aggregations
from pyspark.sql.functions import *

df.groupBy("country") \
  .agg(count("*").alias("cnt"),
       sum("salary").alias("total"),
       avg("salary"),
       max("date"))

df.rollup("country","city").agg(sum("salary"))
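
A .filter() after .agg() plays the role of SQL HAVING (the threshold is made up):

(df.groupBy("country")
   .agg(count("*").alias("cnt"))
   .filter(col("cnt") > 100))          # keep only countries with > 100 rows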

Joins & Sets

# Joins
df1.join(df2, "id", "inner")
df1.join(df2, df1["id"] == df2["user_id"], "left")
df1.join(broadcast(df2), "id")          # hint

# Set operations
df1.union(df2)          # unionAll in old versions
df1.unionByName(df2, allowMissingColumns=True)
df1.intersect(df2)
df1.exceptAll(df2)

# Dedupe
df.dropDuplicates(["id","date"])
df.distinct()

# Sample
df.sample(fraction=0.01, seed=42)
df.limit(100)

Window Functions (Most Used)

from pyspark.sql.window import Window

w = Window.partitionBy("country").orderBy(desc("salary"))
w_rows = Window.partitionBy("country").rowsBetween(-2, 0)

df.withColumn("rank", rank().over(w))
  .withColumn("dense_rank", dense_rank().over(w))
  .withColumn("row_num", row_number().over(w))
  .withColumn("lag", lag("salary", 1).over(w))
  .withColumn("lead", lead("salary", 1).over(w))
  .withColumn("running_sum", sum("salary").over(w.rowsBetween(Window.unboundedPreceding, 0)))
  .withColumn("mov_avg", avg("salary").over(w_rows))

String & Date Magic

# Strings
col("name").substr(1, 3)
regexp_replace(col("phone"), "[^0-9]", "")
split(col("tags"), ",")
array_join(col("tags_array"), "|")
translate(col("text"), "aeiou", "12345")
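
Applied via withColumn, the string helpers look like this (output column names are made up):

df = (df.withColumn("digits", regexp_replace(col("phone"), "[^0-9]", ""))
        .withColumn("tag_list", split(col("tags"), ",")))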

# Dates
to_date(col("ts"))
to_timestamp(col("ts"), "yyyy-MM-dd HH:mm")
date_add(col("date"), 7)
datediff(current_date(), col("birth"))
date_format(col("ts"), "yyyy-MM")

# Conditional
when(col("age") < 18, "minor") \
.when(col("age") < 65, "adult") \
.otherwise("senior")

coalesce(col("new"), col("old"), lit("unknown"))
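
The when/otherwise chain usually lands inside a withColumn (age_group is a made-up column name):

df = df.withColumn(
    "age_group",
    when(col("age") < 18, "minor")
    .when(col("age") < 65, "adult")
    .otherwise("senior"))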

# Nulls
df.dropna(subset=["id"])
df.fillna({"age": 0, "name": "unknown"})
df.na.drop("any")        # vs "all"

UDFs (Use Sparingly)

# Regular UDF (slow)
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

@udf(returnType=StringType())
def clean(text): return text.strip().lower()

# Pandas UDF (vectorized – 10–100× faster)
from pyspark.sql.functions import pandas_udf
import pandas as pd

@pandas_udf(StringType())
def clean_vec(texts: pd.Series) -> pd.Series:
    return texts.str.strip().str.lower()

df.withColumn("clean", clean_vec(col("text")))

Debug & Collect Safely

# Safe collect
df.limit(10).collect()
df.take(5)
df.head(3)

# To pandas (only small data!)
df.limit(1000).toPandas()

# Debug
df.explain(True)        # full plan
spark.conf.get("spark.sql.adaptive.enabled")
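
Writable configs use the same interface via spark.conf.set (the value here is just an example):

spark.conf.set("spark.sql.shuffle.partitions", "200")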

You now speak fluent PySpark.

Copy → Paste → Ship

PySpark 3.5+ • Works on Databricks, EMR, GCP, Azure • Updated 2025