Cannot write to output directory within pipeline in machine learning studio
I have written this code solely to demonstrate the error I need fixed; I cannot share the actual code it belongs to because it is proprietary.
I have the following code:
import os
import shutil
import sys
# Install the required packages at runtime
os.system(f'{sys.executable} -m pip install "environs==14.2.0" "pandas==2.3.0" "psycopg2-binary>=2.9.9,<2.9.11" "pyarrow==20.0.0" "sqlalchemy==2.0.41" scikit_learn==1.6.1')
os.system(f'{sys.executable} -m pip install "h2o==3.46.0.7" "lightgbm==4.6.0" "pandas==2.3.0" "azure-identity==1.23.0" "azure-keyvault-secrets==4.10.0" ')
os.system(f'{sys.executable} -m pip install "boto3==1.39.2" "fsspec==2025.5.1" "s3fs==0.4.2"')
os.system(f'{sys.executable} -m pip install install-jdk')
import jdk
jdk.install('17')
java_path = shutil.which("java")
if java_path:
    print(f"Java found at: {java_path}")
    # Step 2: Infer JAVA_HOME by going two levels up from the binary
    # Example: /usr/lib/jvm/java-11-openjdk-amd64/bin/java → /usr/lib/jvm/java-11-openjdk-amd64
    java_home = os.path.dirname(os.path.dirname(java_path))
    print(f"Inferred JAVA_HOME: {java_home}")
    # Step 3: Set JAVA_HOME and update PATH
    os.environ["JAVA_HOME"] = java_home
    os.environ["PATH"] = os.path.join(java_home, "bin") + os.pathsep + os.environ["PATH"]
    # Optional: Confirm it's working
    print("JAVA_HOME set to:", os.environ["JAVA_HOME"])
    print("Updated PATH:", os.environ["PATH"])
else:
    print("Java not found in PATH. Please install Java or ensure it's accessible.")
import h2o
h2o.init()
This code is run as the single step (the file 'churncommunity3.py') of a pipeline that is created with the following code:
%pip install azureml-sdk azureml.core azureml azureml.pipeline
import azureml.core
from azureml.core import Workspace, Datastore
from azureml.data import OutputFileDatasetConfig
ws = Workspace.from_config()
#def_data_store = ws.get_default_datastore()
# Get the blob storage associated with the workspace
#def_blob_store = Datastore(ws, "workspaceblobstore")
# Get file storage associated with the workspace
def_file_store = Datastore(ws, "workspaceworkingdirectory")
from azureml.core.compute import ComputeTarget, AmlCompute
compute_name = "Compute32"
vm_size = "Standard_D8d_v4"
if compute_name in ws.compute_targets:
    compute_target = ws.compute_targets[compute_name]
    if compute_target and type(compute_target) is AmlCompute:
        print('Found compute target: ' + compute_name)
else:
    print('Creating a new compute target...')
    provisioning_config = AmlCompute.provisioning_configuration(vm_size=vm_size,
                                                                min_nodes=0,
                                                                max_nodes=4)
    # create the compute target
    compute_target = ComputeTarget.create(ws, compute_name, provisioning_config)
    # Can poll for a minimum number of nodes and for a specific timeout.
    # If no min node count is provided it will use the scale settings for the cluster
    compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)
    # For a more detailed view of current cluster status, use the 'status' property
    print(compute_target.status.serialize())
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core import Environment
aml_run_config = RunConfiguration()
# `compute_target` as defined in "Azure Machine Learning compute" section above
aml_run_config.target = compute_target
USE_CURATED_ENV = True
if USE_CURATED_ENV:
    print("hello")
    curated_environment = Environment.get(workspace=ws, name="pythonfinal")
    aml_run_config.environment = curated_environment
else:
    aml_run_config.environment.python.user_managed_dependencies = False
    # Add some packages relied on by data prep step
    aml_run_config.environment.python.conda_dependencies = CondaDependencies.create(
        python_version="3.13.2",
        conda_packages=['pip=22.3.1', 'pandas', 'scikit-learn'],
        pip_packages=None,
        #pip_packages=['azureml-sdk', 'azureml-dataset-runtime[fuse,pandas]'],
        pin_sdk_version=False)
from azureml.pipeline.steps import PythonScriptStep
dataprep_source_dir = "."
entry_point = "churncommunity3.py"
# `my_dataset` as defined above
#ds_input = my_dataset.as_named_input('input1')
#source='/mnt/batch/tasks/shared/LS_root/mounts/clusters/compute32/code/Users/William.Balthes/churnmodel/Users/William.Balthes/churnmodel/churnfiles/',
output_data1 = OutputFileDatasetConfig(destination = (def_file_store, 'Users/William.Balthes/churnmodel'))
# `output_data1`, `compute_target`, `aml_run_config` as defined above
data_prep_step = PythonScriptStep(
script_name=entry_point,
source_directory=dataprep_source_dir,
arguments=[ "--output", output_data1],
compute_target=compute_target,
runconfig=aml_run_config,
allow_reuse=True
)
from azureml.pipeline.core import Pipeline
compare_models = [data_prep_step]
pipeline1 = Pipeline(workspace=ws, steps=[compare_models],default_datastore=def_file_store)
pipeline1.publish("churnmodels30")
from azureml.core import Experiment
from azureml.pipeline.core.schedule import ScheduleRecurrence, Schedule
# Submit the pipeline to be run
#pipeline_run1 = Experiment(ws, 'Churn_models').submit(pipeline1)
#pipeline_run1.wait_for_completion()
import azureml.core
from azureml.core import Workspace
from azureml.pipeline.core import Pipeline, PublishedPipeline
from azureml.core.experiment import Experiment
published_pipelines = PublishedPipeline.list(ws)
for published_pipeline in published_pipelines:
    print(f"{published_pipeline.name},'{published_pipeline.id}'")
from azureml.pipeline.core import PublishedPipeline, PipelineEndpoint
import datetime
# Retrieve a published pipeline
published_pipeline = PublishedPipeline.get(workspace=ws, id='3abaebbd-91b1-4138-967f-f3325d7553f1')
# Create a pipeline endpoint
#pipeline_endpoint = PipelineEndpoint.publish(
#workspace=ws,
#name="Churnendpoint6",
#pipeline=published_pipeline,
#description="churn"
#)
import azureml.core
from azureml.core import Workspace
from azureml.pipeline.core import Pipeline, PublishedPipeline
from azureml.core.experiment import Experiment
from azureml.pipeline.core import Schedule, ScheduleRecurrence, TimeZone
ws = Workspace.from_config()
experiments = Experiment.list(ws)
for experiment in experiments:
    print(experiment.name)
published_pipelines = PublishedPipeline.list(ws)
for published_pipeline in published_pipelines:
    print(f"{published_pipeline.name},'{published_pipeline.id}'")
experiment_name = "churnscheduling23"
pipeline_id = "3abaebbd-91b1-4138-967f-f3325d7553f1"
start_time = datetime.datetime(year=2025, month=8, day=13, hour=18, minute=33)
from azureml.pipeline.core.schedule import ScheduleRecurrence, Schedule
recurrence = ScheduleRecurrence(frequency="Day", interval=1, start_time=start_time,
                                time_zone=TimeZone.AUSEasternStandardTime)
recurring_schedule = Schedule.create(ws, name="MyRecurringSchedule3",
                                     description="Based on time",
                                     pipeline_id=pipeline_id,
                                     experiment_name=experiment_name,
                                     recurrence=recurrence)
I use the environment image mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04
with this conda yaml file:
channels:
- conda-forge
- defaults
dependencies:
- python=3.13.2
- pip=25.1
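For context, this is roughly how that base image and yaml come together into the "pythonfinal" environment referenced in the run configuration above. It is only a minimal sketch of my setup, not the exact code; the file name conda.yml is a placeholder here:
from azureml.core import Environment, Workspace
ws = Workspace.from_config()
# Build the environment from the conda yaml above (file name assumed)
curated_environment = Environment.from_conda_specification(name="pythonfinal", file_path="conda.yml")
# Use the Azure ML base image mentioned above
curated_environment.docker.base_image = "mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04"
curated_environment.register(workspace=ws)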
The logs I have attached show that h2o (a machine learning library) still cannot detect Java, despite it having just been installed in the code with jdk.install('17').
The logs also show that the code immediately after the installation, which is meant to locate the Java binaries and set the environment variables, fails as well because it cannot find the Java binaries either.
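My understanding (which may be wrong) is that jdk.install('17') from the install-jdk package unpacks the JDK into a directory under the home folder (~/.jdk by default) and returns that path, without adding anything to PATH, which would be consistent with shutil.which("java") finding nothing in the step. Below is a small sketch of what I mean, using the return value directly instead of searching PATH; none of this is from the real pipeline code and I have not verified it there:
import os
import shutil
import jdk
# install-jdk returns the directory the JDK was unpacked into (under ~/.jdk by default);
# it does not put the java binary on PATH, so shutil.which("java") can still return None.
install_dir = jdk.install('17')
print("install-jdk put the JDK in:", install_dir)
print("java on PATH:", shutil.which("java"))
# Setting JAVA_HOME/PATH from the returned directory is what I assume would be needed:
os.environ["JAVA_HOME"] = install_dir
os.environ["PATH"] = os.path.join(install_dir, "bin") + os.pathsep + os.environ["PATH"]
print("java on PATH now:", shutil.which("java"))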
What can be done to rectify this situation so that h2o can detect Java and h2o.init() succeeds?
I know it has something to do with the script being run as a pipeline, because when I use the same compute, also with Python 3.13.2 (the same as in the pipeline version), from a notebook, h2o detects Java when h2o.init() is run. In fact, I don't even have to install Java in that case, as it appears to come pre-installed with the environment variables already set up.
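For comparison, this is the kind of minimal diagnostic that could be dropped into both the notebook and the pipeline step to see how the two contexts differ; it is only a sketch, not code from the actual pipeline:
import os
import shutil
import subprocess
# Print where (if anywhere) java is visible in the current context
print("JAVA_HOME:", os.environ.get("JAVA_HOME"))
print("which java:", shutil.which("java"))
try:
    # java -version writes its output to stderr
    print(subprocess.run(["java", "-version"], capture_output=True, text=True).stderr)
except FileNotFoundError:
    print("java binary not found")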