-rw-r--r--  .gitignore          12
-rw-r--r--  Makefile            19
-rw-r--r--  README.md            4
-rw-r--r--  docker-compose.yml  13
-rw-r--r--  docker/Dockerfile   60
-rw-r--r--  main.py              9
-rw-r--r--  requirements.txt     1
7 files changed, 118 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..696c85c
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,12 @@
+.*.swp
+*.pyc
+*.pyo
+.DS_Store
+tags
+.ropeproject
+*.actual
+.vimcache
+.idea
+.mypy_cache
+.envrc
+*.sqlite
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..4a793ab
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,19 @@
+.PHONY: build
+
+help:
+	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'
+
+build: ## Build the Docker image
+	docker-compose -p docker-spark build
+
+up: build ## Bring the container up
+	docker-compose -p docker-spark up -d
+
+down: ## Stop the container
+	docker-compose -p docker-spark stop
+
+enter: ## Enter the running container
+	docker-compose -p docker-spark exec backend /bin/bash
+
+clean: down ## Remove stopped containers
+	docker-compose -p docker-spark rm -f
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..be6b017
--- /dev/null
+++ b/README.md
@@ -0,0 +1,4 @@
+# Docker Spark
+
+Mostly used to figure out how to install Spark into a container (after spending so long doing it).
+You can build the image, enter the container, and run `python main.py` to see the example run.
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..66d2557
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,13 @@
+version: '3'
+services:
+  backend:
+    build:
+      context: ./
+      dockerfile: ./docker/Dockerfile
+    container_name: docker-spark
+    image: thornycrackers/docker-spark
+    volumes:
+      - .:/usr/src/app
+    command: /bin/bash
+    tty: true
+    stdin_open: true
diff --git a/docker/Dockerfile b/docker/Dockerfile
new file mode 100644
index 0000000..02103dc
--- /dev/null
+++ b/docker/Dockerfile
@@ -0,0 +1,60 @@
+FROM python:3.9-bullseye
+
+# Set working directory
+WORKDIR /usr/src/app
+
+# software-properties-common so we can use add-apt-repository
+RUN apt-get update && \
+    apt-get install --no-install-recommends -y \
+    software-properties-common \
+    && rm -rf /var/lib/apt/lists/*
+
+# Java 8 install
+RUN wget -qO - https://adoptopenjdk.jfrog.io/adoptopenjdk/api/gpg/key/public | apt-key add -
+RUN add-apt-repository --yes https://adoptopenjdk.jfrog.io/adoptopenjdk/deb/
+RUN apt-get update && \
+    apt-get install --no-install-recommends -y \
+    adoptopenjdk-8-hotspot \
+    && rm -rf /var/lib/apt/lists/*
+ENV JAVA_HOME=/usr/lib/jvm/adoptopenjdk-8-hotspot-amd64/
+
+# Install python dependencies
+RUN pip install --upgrade pip wheel
+COPY requirements.txt requirements.txt
+RUN pip install --no-cache-dir --prefer-binary -r requirements.txt
+
+# Install Spark
+RUN wget https://archive.apache.org/dist/spark/spark-3.1.2/spark-3.1.2-bin-without-hadoop.tgz
+RUN tar zxvf spark-3.1.2-bin-without-hadoop.tgz
+RUN mv spark-3.1.2-bin-without-hadoop /opt/
+ENV SPARK_HOME=/opt/spark-3.1.2-bin-without-hadoop
+ENV PATH=$SPARK_HOME/bin:$PATH
+
+# You may ask: but you just downloaded Spark without Hadoop, so why install
+# Hadoop? https://spark.apache.org/docs/latest/hadoop-provided.html
+# Spark uses Hadoop client libraries for HDFS and YARN.
+
+# Get Hadoop
+RUN wget https://archive.apache.org/dist/hadoop/common/hadoop-3.2.1/hadoop-3.2.1.tar.gz
+RUN tar zxvf hadoop-3.2.1.tar.gz
+RUN mv hadoop-3.2.1 /opt/
+# Hadoop variables https://phoenixnap.com/kb/install-hadoop-ubuntu
+ENV HADOOP_HOME=/opt/hadoop-3.2.1
+ENV PATH=$PATH:$HADOOP_HOME/bin
+ENV PATH=$PATH:$HADOOP_HOME/sbin
+ENV HADOOP_MAPRED_HOME=$HADOOP_HOME
+ENV HADOOP_COMMON_HOME=$HADOOP_HOME
+ENV HADOOP_HDFS_HOME=$HADOOP_HOME
+ENV YARN_HOME=$HADOOP_HOME
+ENV HADOOP_OPTS="-Djava.library.path=$HADOOP_HOME/lib/native"
+
+# SPARK_DIST_CLASSPATH=$(hadoop classpath)
+# Docker doesn't have a great way to export environment variables dynamically,
+# so if you change the version of Hadoop this will need to be updated as well.
+ENV SPARK_DIST_CLASSPATH=/opt/hadoop-3.2.1/etc/hadoop:/opt/hadoop-3.2.1/share/hadoop/common/lib/*:/opt/hadoop-3.2.1/share/hadoop/common/*:/opt/hadoop-3.2.1/share/hadoop/hdfs:/opt/hadoop-3.2.1/share/hadoop/hdfs/lib/*:/opt/hadoop-3.2.1/share/hadoop/hdfs/*:/opt/hadoop-3.2.1/share/hadoop/mapreduce/lib/*:/opt/hadoop-3.2.1/share/hadoop/mapreduce/*:/opt/hadoop-3.2.1/share/hadoop/yarn:/opt/hadoop-3.2.1/share/hadoop/yarn/lib/*:/opt/hadoop-3.2.1/share/hadoop/yarn/*
+
+# Install the AWS CLI. Use it to test that you can read from AWS with
+# something like `aws s3 ls`.
+RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
+RUN unzip awscliv2.zip
+RUN ./aws/install
diff --git a/main.py b/main.py
new file mode 100644
--- /dev/null
+++ b/main.py
@@ -0,0 +1,9 @@
+from pyspark.sql import SparkSession
+
+spark = SparkSession.builder.getOrCreate()
+dataList = [1, 2, 3, 4, 5]
+rdd = spark.sparkContext.parallelize(dataList)
+rdd2 = rdd.flatMap(lambda x: str(x))
+data = rdd2.collect()
+for x in data:
+    print(type(x), x)
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..53129bb
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1 @@
+pyspark==3.2.1
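
The Dockerfile above pairs the `spark-3.1.2-bin-without-hadoop` build with a separately downloaded Hadoop 3.2.1, glued together through `SPARK_DIST_CLASSPATH` as the linked "Hadoop provided" docs describe. A minimal sketch for checking that wiring from inside the container (`make enter`); the filename is hypothetical and `_jvm` is a private PySpark attribute, so treat it as a throwaway debugging aid rather than part of the commit:

```python
# check_hadoop.py -- hypothetical helper, not part of the commit above.
# Confirms that the "without-hadoop" Spark build can see the Hadoop client
# jars that SPARK_DIST_CLASSPATH points at.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("check-hadoop").getOrCreate()

# _jvm is py4j's gateway into the driver JVM; VersionInfo lives in the Hadoop
# common jar, so this call only resolves if the classpath wiring worked.
version = spark.sparkContext._jvm.org.apache.hadoop.util.VersionInfo.getVersion()
print("Hadoop on the driver classpath:", version)

spark.stop()
```

If the classpath is wrong, the session typically fails to start at all with a `NoClassDefFoundError` for Hadoop classes, which is the quickest symptom to look for.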
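
As for what `main.py` prints: `flatMap(lambda x: str(x))` maps each number to its string form, and because Python strings are iterable, flatMap flattens them into individual one-character strings, so `collect()` returns `str` values rather than ints. A plain-Python sketch of the same flattening, purely to illustrate the semantics (not part of the commit):

```python
from itertools import chain

data_list = [1, 2, 3, 4, 5]

# Mirrors rdd.flatMap(lambda x: str(x)): each element maps to an iterable
# (its string form), and the resulting iterables are flattened into one list.
flattened = list(chain.from_iterable(str(x) for x in data_list))

print(flattened)  # ['1', '2', '3', '4', '5'] -- every element is now a str
```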