from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from sparkxgb import XGBoostRegressor
# End-to-end example: load a CSV file, train an XGBoost regressor through a
# Spark ML pipeline, and print the RMSE measured on a held-out split.
spark = SparkSession.builder.appName("XGBoostRegressionExample").getOrCreate()

# NOTE(review): the ".scv" extension looks like a typo for ".csv"; the path is
# kept as written -- confirm against the actual file name.
data = spark.read.csv("datafile.scv", header=True, inferSchema=True)

# Every column except the target is fed to the model as a feature.
# Assumes all non-label columns are inferred as numeric/boolean, which is
# what VectorAssembler accepts -- TODO confirm against the real schema.
feature_cols = [name for name in data.columns if name != "label"]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# Split the raw rows and let the pipeline run the assembler as its first
# stage. Assembling before the split and then running the same assembler
# inside the pipeline fails because the "features" output column would
# already exist on the input.
train, test = data.randomSplit([0.8, 0.2], seed=42)

xgb = XGBoostRegressor(
    featuresCol="features",
    labelCol="label",
    maxDepth=6,
    # The sparkxgb wrapper names the number of boosting rounds "numRound";
    # "n_estimators" is the scikit-learn-style name and is not accepted here.
    numRound=100,
    objective="reg:squarederror",
)

pipeline = Pipeline(stages=[assembler, xgb])
model = pipeline.fit(train)

# The fitted pipeline assembles the test features itself before predicting.
predictions = model.transform(test)

evaluator_rmse = RegressionEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="rmse",
)
rmse = evaluator_rmse.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")