-rw-r--r--  .gitignore           12
-rw-r--r--  Makefile             19
-rw-r--r--  README.md             4
-rw-r--r--  docker-compose.yml   13
-rw-r--r--  docker/Dockerfile    60
-rw-r--r--  main.py               9
-rw-r--r--  requirements.txt      1
7 files changed, 118 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..696c85c
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,12 @@
+.*.swp
+*.pyc
+*.pyo
+.DS_Store
+tags
+.ropeproject
+*.actual
+.vimcache
+.idea
+.mypy_cache
+.envrc
+*.sqlite
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..4a793ab
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,19 @@
+.PHONY: help build up down enter clean
+
+help:
+ @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'
+
+build: ## Build the Docker image
+ docker-compose -p docker-spark build
+
+up: build ## Bring the container up
+ docker-compose -p docker-spark up -d
+
+down: ## Stop the container
+ docker-compose -p docker-spark stop
+
+enter: ## Enter the running container
+ docker-compose -p docker-spark exec backend /bin/bash
+
+clean: down ## Remove stopped containers
+ docker-compose -p docker-spark rm -f
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..be6b017
--- /dev/null
+++ b/README.md
@@ -0,0 +1,4 @@
+# Docker Spark
+
+Mostly a record of how to install Spark inside a container (after spending so long figuring it out).
+You can build the image, enter the container, and run `python main.py` to see the example run.
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..66d2557
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,13 @@
+version: '3'
+services:
+ backend:
+ build:
+ context: ./
+ dockerfile: ./docker/Dockerfile
+ container_name: docker-spark
+ image: thornycrackers/docker-spark
+ volumes:
+ - .:/usr/src/app
+ command: /bin/bash
+ tty: true
+ stdin_open: true
diff --git a/docker/Dockerfile b/docker/Dockerfile
new file mode 100644
index 0000000..02103dc
--- /dev/null
+++ b/docker/Dockerfile
@@ -0,0 +1,60 @@
+FROM python:3.9-bullseye
+
+# Set working directory
+WORKDIR /usr/src/app
+
+# software-properties-common so we can use add-apt-repository
+RUN apt-get update && \
+ apt-get install --no-install-recommends -y \
+ software-properties-common \
+ && rm -rf /var/lib/apt/lists/*
+
+# Java 8 install
+RUN wget -qO - https://adoptopenjdk.jfrog.io/adoptopenjdk/api/gpg/key/public | apt-key add -
+RUN add-apt-repository --yes https://adoptopenjdk.jfrog.io/adoptopenjdk/deb/
+RUN apt-get update && \
+ apt-get install --no-install-recommends -y \
+ adoptopenjdk-8-hotspot \
+ && rm -rf /var/lib/apt/lists/*
+ENV JAVA_HOME=/usr/lib/jvm/adoptopenjdk-8-hotspot-amd64/
+
+# Install python dependencies
+RUN pip install --upgrade pip wheel
+COPY requirements.txt requirements.txt
+RUN pip install --no-cache-dir --prefer-binary -r requirements.txt
+
+# Install Spark
+RUN wget https://archive.apache.org/dist/spark/spark-3.1.2/spark-3.1.2-bin-without-hadoop.tgz
+RUN tar zxvf spark-3.1.2-bin-without-hadoop.tgz
+RUN mv spark-3.1.2-bin-without-hadoop /opt/
+ENV SPARK_HOME=/opt/spark-3.1.2-bin-without-hadoop
+ENV PATH=$SPARK_HOME/bin:$PATH
+
+# You may ask: if Spark was just downloaded without Hadoop, why install Hadoop?
+# The "without-hadoop" build expects Hadoop client libraries (for HDFS and YARN)
+# to be provided separately: https://spark.apache.org/docs/latest/hadoop-provided.html
+
+# Get Hadoop
+RUN wget https://archive.apache.org/dist/hadoop/common/hadoop-3.2.1/hadoop-3.2.1.tar.gz
+RUN tar zxvf hadoop-3.2.1.tar.gz
+RUN mv hadoop-3.2.1 /opt/
+# Hadoop variables https://phoenixnap.com/kb/install-hadoop-ubuntu
+ENV HADOOP_HOME=/opt/hadoop-3.2.1
+ENV PATH=$PATH:$HADOOP_HOME/bin
+ENV PATH=$PATH:$HADOOP_HOME/sbin
+ENV HADOOP_MAPRED_HOME=$HADOOP_HOME
+ENV HADOOP_COMMON_HOME=$HADOOP_HOME
+ENV HADOOP_HDFS_HOME=$HADOOP_HOME
+ENV YARN_HOME=$HADOOP_HOME
+ENV HADOOP_OPTS="-Djava.library.path=$HADOOP_HOME/lib/native"
+
+# Ideally this would be SPARK_DIST_CLASSPATH=$(hadoop classpath), but Docker
+# doesn't have a great way to export environment variables dynamically, so if
+# you change the version of Hadoop this will need to be updated as well.
+ENV SPARK_DIST_CLASSPATH=/opt/hadoop-3.2.1/etc/hadoop:/opt/hadoop-3.2.1/share/hadoop/common/lib/*:/opt/hadoop-3.2.1/share/hadoop/common/*:/opt/hadoop-3.2.1/share/hadoop/hdfs:/opt/hadoop-3.2.1/share/hadoop/hdfs/lib/*:/opt/hadoop-3.2.1/share/hadoop/hdfs/*:/opt/hadoop-3.2.1/share/hadoop/mapreduce/lib/*:/opt/hadoop-3.2.1/share/hadoop/mapreduce/*:/opt/hadoop-3.2.1/share/hadoop/yarn:/opt/hadoop-3.2.1/share/hadoop/yarn/lib/*:/opt/hadoop-3.2.1/share/hadoop/yarn/*
+
+# Install the AWS CLI. Use this to test that you can read from AWS with
+# something like `aws s3 ls`.
+RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
+RUN unzip awscliv2.zip
+RUN ./aws/install
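A quick way to sanity-check the finished image is to open a PySpark session inside the container and confirm that both the Spark and Hadoop sides come up. The snippet below is a minimal sketch; the file name `smoke_test.py` and the use of the private `_jvm` handle are my own additions, not part of this change:

```python
# smoke_test.py -- hypothetical helper, not part of this change.
# Run inside the container (e.g. after `make enter`) to confirm that the
# pip-installed PySpark bindings can start a JVM backed by the Spark and
# Hadoop installs configured above.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("smoke-test").getOrCreate()
print("Spark version: ", spark.version)
# Reach into the JVM via py4j (private attribute, fine for a one-off check) to
# confirm the Hadoop client libraries on SPARK_DIST_CLASSPATH are being picked up.
print("Hadoop version:", spark.sparkContext._jvm.org.apache.hadoop.util.VersionInfo.getVersion())
print(spark.sparkContext.parallelize(range(10)).sum())  # expect 45
spark.stop()
```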
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..166a9fc
--- /dev/null
+++ b/main.py
@@ -0,0 +1,9 @@
+from pyspark.sql import SparkSession
+
+spark = SparkSession.builder.getOrCreate()
+dataList = [1, 2, 3, 4, 5]
+rdd = spark.sparkContext.parallelize(dataList)
+rdd2 = rdd.flatMap(lambda x: str(x))
+data = rdd2.collect()
+for x in data:
+ print(type(x), x)
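One detail worth calling out in main.py: `flatMap(lambda x: str(x))` does not just convert each number to a string. Because a Python string is an iterable of characters, flatMap flattens each result into its individual characters, so the loop prints one single-character `str` per digit. A small contrast sketch (not part of this change) showing the difference from `map`:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
rdd = spark.sparkContext.parallelize([12, 34])
print(rdd.map(str).collect())      # ['12', '34']          - one string per element
print(rdd.flatMap(str).collect())  # ['1', '2', '3', '4']  - flattened into characters
spark.stop()
```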
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..53129bb
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1 @@
+pyspark==3.2.1
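The only pinned Python dependency is the PySpark bindings. Note that the pip pin (3.2.1) is not the same version as the Spark distribution unpacked in the Dockerfile (3.1.2). A quick check of which bindings ended up in the image (a hypothetical one-off snippet, not part of this change):

```python
import pyspark

print(pyspark.__version__)  # 3.2.1 per requirements.txt; the JVM-side distro under SPARK_HOME is 3.1.2
```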