Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 87 additions & 6 deletions metadata-ingestion/src/datahub/ingestion/source/sql/oracle.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import ctypes
import datetime
import glob
import logging
import os
import platform
Expand Down Expand Up @@ -29,7 +31,7 @@
from sqlalchemy.types import FLOAT, INTEGER, TIMESTAMP

import datahub.metadata.schema_classes as models
from datahub.configuration.common import AllowDenyPattern
from datahub.configuration.common import AllowDenyPattern, ConfigurationError
from datahub.emitter.mce_builder import (
DEFAULT_ENV,
make_data_job_urn,
Expand Down Expand Up @@ -378,6 +380,70 @@ def normalize_db_name(name: str) -> str:
"""


# Oracle Instant Client shared libs in the order they should be preloaded.
# libclntsh is last because it has DT_NEEDED entries on the others; loading
# the deps first by absolute path puts them in the process namespace by SONAME
# so the linker reuses them when libclntsh is opened.
_ORACLE_PRELOAD_PATTERNS = (
"libnnz*.so*",
"libclntshcore.so*",
"libons.so*",
"libipc1.so*",
"libmql1.so*",
"libociei.so*",
"libclntsh.so*",
)


def _preload_oracle_client_libs(lib_dir: str) -> None:
    """Preload Oracle Instant Client libs from ``lib_dir`` so that
    ``oracledb.init_oracle_client()`` succeeds on Linux without needing
    ``LD_LIBRARY_PATH`` or ``ldconfig`` to be configured.

    Background: on Linux, Oracle ships ``libclntsh.so`` without
    ``RUNPATH=$ORIGIN``. Even when python-oracledb / ODPI-C dlopens
    ``libclntsh.so`` via an absolute path (which is what ``lib_dir`` does),
    the dynamic linker still has to resolve its DT_NEEDED dependencies
    (``libnnz*.so``, ``libclntshcore.so``, ``libons.so``, ...) through the
    normal ``LD_LIBRARY_PATH`` / ``ld.so.cache`` rules. With neither
    configured, the load fails with DPI-1047.

    Setting ``LD_LIBRARY_PATH`` from Python doesn't help: glibc's loader
    reads it once at process startup. Loading each ``.so`` by absolute path
    with ``RTLD_GLOBAL`` does work — once an object is mapped, the linker
    looks it up by SONAME for subsequent ``dlopen()`` calls and finds it.

    See https://github.com/oracle/python-oracledb/issues/578 for the upstream
    discussion confirming this can only be fixed by preloading from the client
    side or by patching ``RUNPATH=$ORIGIN`` into ``libclntsh.so`` itself.

    Args:
        lib_dir: Directory containing the unpacked Instant Client libraries.
            It is treated as a literal path; glob metacharacters in it are
            escaped before searching.

    Raises:
        ConfigurationError: If ``lib_dir`` is not an existing directory, if no
            file in it matches ``_ORACLE_PRELOAD_PATTERNS``, or if matching
            files exist but none of them could be loaded.
    """
    if not os.path.isdir(lib_dir):
        raise ConfigurationError(
            f"thick_mode_lib_dir={lib_dir!r} does not exist or is not a directory"
        )

    # Only the file-name patterns are meant to be globs. Escape the directory
    # part so that metacharacters in it ("[", "*", "?") match literally;
    # otherwise a path such as "/opt/oracle[21]/lib" would match nothing.
    search_root = glob.escape(lib_dir)

    loaded_any = False
    failed_count = 0
    last_error: Optional[OSError] = None
    for pattern in _ORACLE_PRELOAD_PATTERNS:
        # sorted() keeps the load order deterministic within one pattern.
        for path in sorted(glob.glob(os.path.join(search_root, pattern))):
            try:
                ctypes.CDLL(path, mode=ctypes.RTLD_GLOBAL)
            except OSError as e:
                # Non-fatal: a satellite lib that fails to load (e.g. libipc1
                # in older client releases) is fine as long as libclntsh and
                # its actual deps load. Keep going so we surface a useful
                # error from init_oracle_client() if anything critical is
                # missing.
                failed_count += 1
                last_error = e
                logger.debug("Skipping %s while preloading: %s", path, e)
            else:
                loaded_any = True
                logger.debug("Preloaded Oracle client lib: %s", path)

    if loaded_any:
        return

    if last_error is not None:
        # Files matched but every load failed: say so instead of claiming the
        # directory is empty, and keep the loader's own message for diagnosis.
        raise ConfigurationError(
            f"No Oracle Instant Client libraries could be loaded from {lib_dir!r}: "
            f"{failed_count} matching file(s) were found but every load failed "
            f"(last error: {last_error})."
        ) from last_error

    raise ConfigurationError(
        f"No Oracle Instant Client libraries found in {lib_dir!r}. "
        "Verify the path points to an unpacked Instant Client (it should "
        "contain libclntsh.so* and libnnz*.so*)."
    )


def _setup_oracle_compatibility() -> None:
"""
Set up Oracle compatibility for SQLAlchemy.
Expand Down Expand Up @@ -466,8 +532,13 @@ class OracleConfig(BasicSQLAlchemyConfig, BaseUsageConfig):
)
thick_mode_lib_dir: Optional[str] = Field(
default=None,
description="If using thick mode on Windows or Mac, set thick_mode_lib_dir to the oracle client libraries path. "
"On Linux, this value is ignored, as ldconfig or LD_LIBRARY_PATH will define the location.",
description="Path to the directory containing the Oracle Instant Client libraries. "
"Required on Windows and Mac when enable_thick_mode is true. "
"Optional on Linux: when set, the connector preloads the client libraries "
"from this directory before initializing python-oracledb, which makes "
"thick mode work without needing ldconfig or LD_LIBRARY_PATH to be set "
"(see https://github.com/oracle/python-oracledb/issues/578). When unset "
"on Linux, the standard ldconfig / LD_LIBRARY_PATH search is used.",
)
# Stored procedures configuration
include_stored_procedures: bool = Field(
Expand Down Expand Up @@ -1297,11 +1368,21 @@ def __init__(self, config, ctx):
# create_engine, which is called in get_inspectors()
# https://python-oracledb.readthedocs.io/en/latest/user_guide/initialization.html#enabling-python-oracledb-thick-mode
if self.config.enable_thick_mode:
if platform.system() == "Darwin" or platform.system() == "Windows":
# windows and mac os require lib_dir to be set explicitly
if platform.system() in ("Darwin", "Windows"):
# Mac/Windows: lib_dir is required and is enough; the platform's
# loader handles the dependent libs.
oracledb.init_oracle_client(lib_dir=self.config.thick_mode_lib_dir)
elif self.config.thick_mode_lib_dir:
# Linux: passing lib_dir to init_oracle_client() locates
# libclntsh.so itself but the loader still falls back to
# LD_LIBRARY_PATH / ld.so.cache for its DT_NEEDED deps, which
# fails on hosts that don't have ldconfig set up. Preload every
# .so in lib_dir by absolute path with RTLD_GLOBAL so the deps
# are resolved by SONAME from the process namespace.
_preload_oracle_client_libs(self.config.thick_mode_lib_dir)
oracledb.init_oracle_client()
else:
# linux requires configurating the library path with ldconfig or LD_LIBRARY_PATH
# Linux without thick_mode_lib_dir: rely on ldconfig / LD_LIBRARY_PATH.
oracledb.init_oracle_client()

# Pre-fetch schemas from DataHub when not ingesting all tables/views so that
Expand Down
130 changes: 129 additions & 1 deletion metadata-ingestion/tests/unit/test_oracle_source.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import os
import unittest.mock
from datetime import datetime
from typing import List, Optional
from unittest.mock import MagicMock, Mock, patch

import pytest
Expand All @@ -9,7 +11,7 @@
from sqlalchemy.engine import Inspector
from sqlalchemy.sql import sqltypes

from datahub.configuration.common import AllowDenyPattern
from datahub.configuration.common import AllowDenyPattern, ConfigurationError
from datahub.ingestion.api.common import PipelineContext
from datahub.ingestion.source.sql.oracle import (
VSQL_USAGE_QUERY,
Expand All @@ -19,6 +21,7 @@
OracleSource,
ProcedureDependencies,
VSqlPrerequisiteCheckResult,
_preload_oracle_client_libs,
extra_oracle_types,
)
from datahub.ingestion.source.sql.sql_report import SQLSourceReport
Expand Down Expand Up @@ -1423,3 +1426,128 @@ def mock_parent_workunits():
list(source.get_workunits())

assert all(registered_during_iteration)


def _make_thick_mode_config(thick_mode_lib_dir: Optional[str] = None) -> OracleConfig:
    """Return an ``OracleConfig`` with thick mode enabled.

    The connection values are fixed placeholders; only ``thick_mode_lib_dir``
    varies between callers.
    """
    thick_config = OracleConfig(
        enable_thick_mode=True,
        thick_mode_lib_dir=thick_mode_lib_dir,
        host_port="host:1521",
        service_name="svc01",
        username="user",
        password="password",
    )
    return thick_config


def test_preload_oracle_client_libs_loads_in_dependency_order(tmp_path):
    """Dependencies are opened before libclntsh; absent optional libs are tolerated."""
    main_lib = "libclntsh.so.21.1"
    dependency_libs = ("libnnz12.so", "libclntshcore.so.21.1", "libons.so")

    # Stub out only a subset of what the helper searches for. libipc1,
    # libmql1 and libociei are deliberately left out: the helper has to cope
    # with them being absent.
    for stub_name in (*dependency_libs, main_lib):
        (tmp_path / stub_name).write_bytes(b"")

    opened: List[str] = []

    class _RecordingCDLL:
        """Stand-in for ``ctypes.CDLL`` that only records what was opened."""

        def __init__(self, path: str, mode: int) -> None:
            opened.append(os.path.basename(path))

    with patch("datahub.ingestion.source.sql.oracle.ctypes.CDLL", _RecordingCDLL):
        _preload_oracle_client_libs(str(tmp_path))

    # The main library must be the very last one opened.
    assert opened[-1] == main_lib
    for dependency in dependency_libs:
        assert dependency in opened
    # libnnz and libclntshcore must both come before libclntsh.
    main_position = opened.index(main_lib)
    assert opened.index("libnnz12.so") < main_position
    assert opened.index("libclntshcore.so.21.1") < main_position


def test_preload_oracle_client_libs_raises_when_dir_missing(tmp_path):
    """A path that does not exist is rejected with a ConfigurationError."""
    missing_dir = str(tmp_path / "does-not-exist")

    with pytest.raises(ConfigurationError, match="does not exist"):
        _preload_oracle_client_libs(missing_dir)


def test_preload_oracle_client_libs_raises_when_dir_empty(tmp_path):
    """An existing directory without any Oracle libs raises a clear error."""
    # Failing here gives a clearer message than the cryptic DPI-1047 that
    # would otherwise show up downstream.
    empty_dir = str(tmp_path)

    with pytest.raises(ConfigurationError, match="No Oracle Instant Client libraries"):
        _preload_oracle_client_libs(empty_dir)


def test_preload_oracle_client_libs_skips_individual_load_failures(tmp_path):
    """A single library that fails to load must not abort the rest of the preload."""
    failing_lib = "libnnz12.so"
    healthy_lib = "libclntsh.so.21.1"
    for stub_name in (failing_lib, healthy_lib):
        (tmp_path / stub_name).write_bytes(b"")

    attempted: List[str] = []

    class _FailingOnceCDLL:
        """Stand-in for ``ctypes.CDLL`` that rejects exactly one library."""

        def __init__(self, path: str, mode: int) -> None:
            attempted.append(os.path.basename(path))
            if path.endswith("libnnz12.so"):
                raise OSError("synthetic load failure")

    with patch("datahub.ingestion.source.sql.oracle.ctypes.CDLL", _FailingOnceCDLL):
        _preload_oracle_client_libs(str(tmp_path))

    # Both libraries were tried: the failure was skipped, not propagated.
    assert failing_lib in attempted
    assert healthy_lib in attempted


@patch("datahub.ingestion.source.sql.oracle.platform.system", return_value="Linux")
@patch("datahub.ingestion.source.sql.oracle._preload_oracle_client_libs")
@patch("datahub.ingestion.source.sql.oracle.oracledb")
def test_oracle_source_linux_preloads_when_lib_dir_set(
    mock_oracledb, mock_preload, _mock_platform, tmp_path
):
    """On Linux with a lib dir configured, the source preloads and then inits."""
    lib_dir = str(tmp_path)
    thick_config = _make_thick_mode_config(thick_mode_lib_dir=lib_dir)
    pipeline_ctx = PipelineContext("test-thick-linux-preload")

    OracleSource(thick_config, pipeline_ctx)

    mock_preload.assert_called_once_with(lib_dir)
    # On Linux lib_dir must NOT be forwarded to init_oracle_client(): the
    # preload is what makes the libraries resolvable, so the client is
    # initialized with no arguments (see oracle/python-oracledb#578).
    mock_oracledb.init_oracle_client.assert_called_once_with()


@patch("datahub.ingestion.source.sql.oracle.platform.system", return_value="Linux")
@patch("datahub.ingestion.source.sql.oracle._preload_oracle_client_libs")
@patch("datahub.ingestion.source.sql.oracle.oracledb")
def test_oracle_source_linux_skips_preload_when_lib_dir_unset(
    mock_oracledb, mock_preload, _mock_platform
):
    """On Linux without a lib dir, no preload happens and init takes no arguments."""
    thick_config = _make_thick_mode_config(thick_mode_lib_dir=None)
    pipeline_ctx = PipelineContext("test-thick-linux-no-preload")

    OracleSource(thick_config, pipeline_ctx)

    mock_preload.assert_not_called()
    mock_oracledb.init_oracle_client.assert_called_once_with()


@patch("datahub.ingestion.source.sql.oracle.platform.system", return_value="Darwin")
@patch("datahub.ingestion.source.sql.oracle._preload_oracle_client_libs")
@patch("datahub.ingestion.source.sql.oracle.oracledb")
def test_oracle_source_mac_passes_lib_dir_without_preload(
    mock_oracledb, mock_preload, _mock_platform, tmp_path
):
    """On macOS the lib dir goes straight to init_oracle_client, with no preload."""
    lib_dir = str(tmp_path)
    thick_config = _make_thick_mode_config(thick_mode_lib_dir=lib_dir)
    pipeline_ctx = PipelineContext("test-thick-mac")

    OracleSource(thick_config, pipeline_ctx)

    # macOS uses dyld rather than glibc's loader, so the preload trick is not
    # needed there: init_oracle_client(lib_dir=...) is sufficient.
    mock_preload.assert_not_called()
    mock_oracledb.init_oracle_client.assert_called_once_with(lib_dir=lib_dir)
Loading