Skip to content

Commit b2e2523

Browse files
authored
HIVE-29577: Support prefetching Tez snapshot jars into the Docker image (#6448)
1 parent 0c362e9 commit b2e2523

3 files changed

Lines changed: 70 additions & 5 deletions

File tree

packaging/src/docker/Dockerfile

Lines changed: 40 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -67,18 +67,22 @@ RUN tar -xzv \
6767
tar -xzv \
6868
--exclude="apache-tez-$TEZ_VERSION-bin/share" \
6969
-f /opt/apache-tez-$TEZ_VERSION-bin.tar.gz \
70-
-C /opt
70+
-C /opt; \
71+
mkdir -p /opt/tez-snapshot;
7172

7273
FROM eclipse-temurin:21-jdk-ubi9-minimal AS run
7374

7475
ARG UID=1000
7576
ARG HADOOP_VERSION
7677
ARG HIVE_VERSION
7778
ARG TEZ_VERSION
79+
ARG TEZ_SNAPSHOT_VERSION=
80+
ARG TEZ_SNAPSHOT_REPO_URL=https://repository.apache.org/content/repositories/snapshots
81+
7882
# Install dependencies
7983
RUN set -ex; \
8084
microdnf update -y; \
81-
microdnf -y install procps gettext; \
85+
microdnf -y install procps gettext wget xmlstarlet; \
8286
microdnf clean all; \
8387
useradd --no-create-home -s /sbin/nologin -c "" --uid $UID hive
8488

@@ -93,6 +97,38 @@ ENV PATH=$HIVE_HOME/bin:$HADOOP_HOME/bin:$PATH
9397
COPY --from=env --chown=hive /opt/hadoop-$HADOOP_VERSION $HADOOP_HOME
9498
COPY --from=env --chown=hive /opt/apache-hive-$HIVE_VERSION-bin $HIVE_HOME
9599
COPY --from=env --chown=hive /opt/apache-tez-$TEZ_VERSION-bin $TEZ_HOME
100+
COPY --from=env --chown=hive /opt/tez-snapshot /opt/tez-snapshot
101+
102+
# When TEZ_SNAPSHOT_VERSION is set, fetch Tez snapshot jars from the Maven snapshot repository
103+
# and place them under /opt/tez-snapshot. At runtime, entrypoint.sh symlinks these into
104+
# $HIVE_HOME/lib with a "0-" prefix so they sort first in bin/hive's classpath glob, ensuring
105+
# snapshot classes take precedence over the Tez release jars bundled with Hive.
106+
# Maven snapshot repositories use timestamped filenames (e.g. tez-api-1.0.0-20250101.123456-1.jar),
107+
# so we fetch maven-metadata.xml first to resolve the exact filename before downloading the jar.
108+
# Optionally prefetch Tez snapshot jars into /opt/tez-snapshot.
# - No-op (apart from a log line) when TEZ_SNAPSHOT_VERSION is empty or unset.
# - For each artifact, maven-metadata.xml is downloaded first and the jar's resolved
#   snapshot version is read from it; that value forms the jar filename to download.
# - The shell form of RUN is executed by /bin/sh, so only POSIX test syntax ([ ]) is used
#   rather than bash-only [[ ]].
# - The build stops if a download fails (set -e) or if no jar snapshot version can be
#   resolved from the metadata (explicit check with a message on stderr).
RUN set -eux; \
    mkdir -p /opt/tez-snapshot-download; \
    if [ -n "${TEZ_SNAPSHOT_VERSION:-}" ]; then \
        base_url="${TEZ_SNAPSHOT_REPO_URL}/org/apache/tez"; \
        for artifact in tez-common tez-api tez-dag tez-mapreduce tez-runtime-internals tez-runtime-library; do \
            version_url="${base_url}/${artifact}/${TEZ_SNAPSHOT_VERSION}"; \
            metadata_url="${version_url}/maven-metadata.xml"; \
            metadata_file="/opt/tez-snapshot-download/${artifact}-maven-metadata.xml"; \
            echo "metadata_url=${metadata_url}"; \
            wget -q "${metadata_url}" -O "${metadata_file}"; \
            snapshot_value="$(xmlstarlet sel -t -v "string(/metadata/versioning/snapshotVersions/snapshotVersion[extension='jar' and not(classifier)]/value)" "${metadata_file}")"; \
            if [ -z "${snapshot_value}" ]; then \
                echo "No jar snapshotVersion found in ${metadata_url}" >&2; \
                exit 1; \
            fi; \
            jar_file="${artifact}-${snapshot_value}.jar"; \
            jar_url="${version_url}/${jar_file}"; \
            echo "jar_url=${jar_url}"; \
            wget -q "${jar_url}" -O "/opt/tez-snapshot/${jar_file}"; \
        done; \
        echo "Downloaded Tez snapshot jars under /opt/tez-snapshot:"; \
        ls -1 /opt/tez-snapshot/*.jar; \
    else \
        echo "TEZ_SNAPSHOT_VERSION not set. Skipping Tez snapshot download."; \
    fi; \
    rm -rf /opt/tez-snapshot-download
131+
96132

97133
COPY --chown=hive entrypoint.sh /
98134
COPY --chown=hive conf $HIVE_HOME/conf
@@ -103,7 +139,8 @@ RUN chmod +x /entrypoint.sh && \
103139
mkdir -p $HIVE_HOME/scratch && \
104140
chown hive $HIVE_HOME/scratch && \
105141
mkdir -p /home/hive/.beeline && \
106-
chown hive /home/hive/.beeline
142+
chown hive /home/hive/.beeline && \
143+
chown -R hive /opt/tez-snapshot
107144

108145
USER hive
109146
WORKDIR $HIVE_HOME

packaging/src/docker/build.sh

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,13 +20,15 @@ set -eux
2020
HIVE_VERSION=
2121
HADOOP_VERSION=
2222
TEZ_VERSION=
23+
TEZ_SNAPSHOT_VERSION=
2324
usage() {
2425
cat <<EOF 1>&2
25-
Usage: $0 [-h] [-hadoop <Hadoop version>] -tez <Tez release version> [-hive <Hive version>] [-repo <Docker repo>]
26+
Usage: $0 [-h] [-hadoop <Hadoop version>] -tez <Tez release version> [-tez-snapshot [<Maven snapshot version>]] [-hive <Hive version>] [-repo <Docker repo>]
2627
Build the Hive Docker image (reused for LLAP too)
2728
-help Display help
2829
-hadoop Build image with the specified Hadoop version (default: from Maven pom)
2930
-tez Required. Tez release tarball version (apache-tez-\$TEZ_VERSION-bin.tar.gz from archive)
31+
-tez-snapshot <ver> Optional. When a snapshot version is given, fetch Tez Maven snapshot jars into the image. With no version, snapshot prefetch is skipped.
3032
-hive Build image with the specified Hive version
3133
-repo Docker repository
3234
EOF
@@ -48,6 +50,13 @@ while [ $# -gt 0 ]; do
4850
TEZ_VERSION=$1
4951
shift
5052
;;
53+
-tez-snapshot)
54+
shift
55+
if [ $# -gt 0 ] && [[ "$1" != -* ]]; then
56+
TEZ_SNAPSHOT_VERSION=$1
57+
shift
58+
fi
59+
;;
5160
-hive)
5261
shift
5362
HIVE_VERSION=$1
@@ -135,6 +144,9 @@ DOCKER_BUILD_ARGS=(
135144
--build-arg "HADOOP_VERSION=$HADOOP_VERSION"
136145
--build-arg "TEZ_VERSION=$TEZ_VERSION"
137146
)
147+
# Forward TEZ_SNAPSHOT_VERSION to docker build only when a snapshot version was supplied;
# with an empty value no extra --build-arg is added.
[[ -z "${TEZ_SNAPSHOT_VERSION}" ]] || DOCKER_BUILD_ARGS+=(--build-arg "TEZ_SNAPSHOT_VERSION=${TEZ_SNAPSHOT_VERSION}")
138150

139151
docker build \
140152
"$WORK_DIR" \

packaging/src/docker/entrypoint.sh

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -147,8 +147,13 @@ function run_tezam {
147147
export HIVE_HOME="${HIVE_HOME:-/opt/hive}"
148148
export HADOOP_CONF_DIR="${HADOOP_CONF_DIR:-$HIVE_CONF_DIR}"
149149
export TEZ_CONF_DIR="${TEZ_CONF_DIR:-$HADOOP_CONF_DIR}"
150+
: "${TEZ_SNAPSHOT_HOME:=/opt/tez-snapshot}"
151+
if [[ ! -d "${TEZ_SNAPSHOT_HOME}" ]]; then
152+
echo "Tez snapshot home not found at ${TEZ_SNAPSHOT_HOME}. Rebuild image to prefetch snapshot artifacts."
153+
exit 1
154+
fi
150155
# service_plugins_descriptor.json references org.apache.hadoop.hive.llap.tezplugins.* (hive-llap-tez, etc.)
151-
tezam_cp="${HADOOP_CONF_DIR}:${TEZ_CONF_DIR}:${TEZ_HOME}/*:${TEZ_HOME}/lib/*:${HIVE_HOME}/lib/*:${HADOOP_HOME}/share/hadoop/common/*:${HADOOP_HOME}/share/hadoop/common/lib/*:${HADOOP_HOME}/share/hadoop/yarn/*:${HADOOP_HOME}/share/hadoop/yarn/lib/*:${HADOOP_HOME}/share/hadoop/hdfs/*:${HADOOP_HOME}/share/hadoop/hdfs/lib/*:${HADOOP_HOME}/share/hadoop/mapreduce/*:${HADOOP_HOME}/share/hadoop/mapreduce/lib/*:${HADOOP_CLASSPATH:-}"
156+
tezam_cp="${HADOOP_CONF_DIR}:${TEZ_CONF_DIR}:${TEZ_SNAPSHOT_HOME}/*:${TEZ_HOME}/*:${TEZ_HOME}/lib/*:${HIVE_HOME}/lib/*:${HADOOP_HOME}/share/hadoop/common/*:${HADOOP_HOME}/share/hadoop/common/lib/*:${HADOOP_HOME}/share/hadoop/yarn/*:${HADOOP_HOME}/share/hadoop/yarn/lib/*:${HADOOP_HOME}/share/hadoop/hdfs/*:${HADOOP_HOME}/share/hadoop/hdfs/lib/*:${HADOOP_HOME}/share/hadoop/mapreduce/*:${HADOOP_HOME}/share/hadoop/mapreduce/lib/*:${HADOOP_CLASSPATH:-}"
152157

153158
local java_bin
154159
local tezam_java_opts
@@ -178,6 +183,17 @@ if [[ "${SKIP_SCHEMA_INIT}" == "false" && ( "${SERVICE_NAME}" == "hiveserver2" |
178183
fi
179184

180185
if [ "${SERVICE_NAME}" == "hiveserver2" ]; then
186+
TEZ_SNAPSHOT_HOME="${TEZ_SNAPSHOT_HOME:-/opt/tez-snapshot}"
# Expose every snapshot jar in $HIVE_HOME/lib as a symlink named "0-<jar name>".
# bin/hive (not shown here) is understood to prepend $HIVE_HOME/lib/*.jar to HADOOP_CLASSPATH
# in glob order, so anything placed at the front of HADOOP_CLASSPATH would still land after
# all Hive lib jars. The "0-" prefix makes the links sort ahead of the regular jar names
# ('0' precedes letters in ASCII order), which puts the snapshot classes first.
for snapshot_jar in "${TEZ_SNAPSHOT_HOME}"/*.jar; do
  # An unmatched glob is left as the literal pattern; skip it so nothing is linked
  # when the directory holds no jars.
  [[ -e "${snapshot_jar}" || -L "${snapshot_jar}" ]] || continue
  ln -sf "${snapshot_jar}" "${HIVE_HOME}/lib/0-${snapshot_jar##*/}"
done
181197
export HADOOP_CLASSPATH="$TEZ_HOME/*:$TEZ_HOME/lib/*:$HADOOP_CLASSPATH"
182198
exec "$HIVE_HOME/bin/hive" --skiphadoopversion --skiphbasecp --service "$SERVICE_NAME"
183199
elif [ "${SERVICE_NAME}" == "metastore" ]; then

0 commit comments

Comments
 (0)