Cannot write to output directory within pipeline in machine learning studio
I have written this code solely to demonstrate the error I need fixed; I cannot share the actual code it belongs to because it is proprietary.
I have the following code:
import os
import shutil
import sys
# Install the required packages at runtime
os.system(f'{sys.executable} -m pip install "environs==14.2.0" "pandas==2.3.0" "psycopg2-binary>=2.9.9,<2.9.11" "pyarrow==20.0.0" "sqlalchemy==2.0.41" scikit_learn==1.6.1')
os.system(f'{sys.executable} -m pip install "h2o==3.46.0.7" "lightgbm==4.6.0" "pandas==2.3.0" "azure-identity==1.23.0" "azure-keyvault-secrets==4.10.0" ')
os.system(f'{sys.executable} -m pip install "boto3==1.39.2" "fsspec==2025.5.1" "s3fs==0.4.2"')
os.system(f'{sys.executable} -m pip install install-jdk')
import jdk
jdk.install('17')
java_path = shutil.which("java")
if java_path:
    print(f"Java found at: {java_path}")
    # Step 2: Infer JAVA_HOME by going two levels up from the binary
    # Example: /usr/lib/jvm/java-11-openjdk-amd64/bin/java → /usr/lib/jvm/java-11-openjdk-amd64
    java_home = os.path.dirname(os.path.dirname(java_path))
    print(f"Inferred JAVA_HOME: {java_home}")
    # Step 3: Set JAVA_HOME and update PATH
    os.environ["JAVA_HOME"] = java_home
    os.environ["PATH"] = os.path.join(java_home, "bin") + os.pathsep + os.environ["PATH"]
    # Optional: Confirm it's working
    print("JAVA_HOME set to:", os.environ["JAVA_HOME"])
    print("Updated PATH:", os.environ["PATH"])
else:
    print("Java not found in PATH. Please install Java or ensure it's accessible.")
import h2o
h2o.init()
This code is run as the single step (the file 'churncommunity3.py') of a pipeline that is created with the following code:
%pip install azureml-sdk azureml.core azureml azureml.pipeline
import azureml.core
from azureml.core import Workspace, Datastore
from azureml.data import OutputFileDatasetConfig
ws = Workspace.from_config()
#def_data_store = ws.get_default_datastore()
# Get the blob storage associated with the workspace
#def_blob_store = Datastore(ws, "workspaceblobstore")
# Get file storage associated with the workspace
def_file_store = Datastore(ws, "workspaceworkingdirectory")
from azureml.core.compute import ComputeTarget, AmlCompute
compute_name = "Compute32"
vm_size = "Standard_D8d_v4"
if compute_name in ws.compute_targets:
    compute_target = ws.compute_targets[compute_name]
    if compute_target and type(compute_target) is AmlCompute:
        print('Found compute target: ' + compute_name)
else:
    print('Creating a new compute target...')
    provisioning_config = AmlCompute.provisioning_configuration(vm_size=vm_size,
                                                                min_nodes=0,
                                                                max_nodes=4)
    # create the compute target
    compute_target = ComputeTarget.create(ws, compute_name, provisioning_config)
    # Can poll for a minimum number of nodes and for a specific timeout.
    # If no min node count is provided it will use the scale settings for the cluster
    compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)
    # For a more detailed view of current cluster status, use the 'status' property
    print(compute_target.status.serialize())
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core import Environment
aml_run_config = RunConfiguration()
# `compute_target` as defined in "Azure Machine Learning compute" section above
aml_run_config.target = compute_target
USE_CURATED_ENV = True
if USE_CURATED_ENV:
    print("hello")
    curated_environment = Environment.get(workspace=ws, name="pythonfinal")
    aml_run_config.environment = curated_environment
else:
    aml_run_config.environment.python.user_managed_dependencies = False
    # Add some packages relied on by data prep step
    aml_run_config.environment.python.conda_dependencies = CondaDependencies.create(
        python_version="3.13.2",
        conda_packages=['pip=22.3.1', 'pandas', 'scikit-learn'],
        pip_packages=None,
        #pip_packages=['azureml-sdk', 'azureml-dataset-runtime[fuse,pandas]'],
        pin_sdk_version=False)
from azureml.pipeline.steps import PythonScriptStep
dataprep_source_dir = "."
entry_point = "churncommunity3.py"
# `my_dataset` as defined above
#ds_input = my_dataset.as_named_input('input1')
#source='/mnt/batch/tasks/shared/LS_root/mounts/clusters/compute32/code/Users/William.Balthes/churnmodel/Users/William.Balthes/churnmodel/churnfiles/',
output_data1 = OutputFileDatasetConfig(destination = (def_file_store, 'Users/William.Balthes/churnmodel'))
# `output_data1`, `compute_target`, `aml_run_config` as defined above
data_prep_step = PythonScriptStep(
script_name=entry_point,
source_directory=dataprep_source_dir,
arguments=[ "--output", output_data1],
compute_target=compute_target,
runconfig=aml_run_config,
allow_reuse=True
)
from azureml.pipeline.core import Pipeline
compare_models = [data_prep_step]
pipeline1 = Pipeline(workspace=ws, steps=[compare_models],default_datastore=def_file_store)
pipeline1.publish("churnmodels30")
from azureml.core import Experiment
from azureml.pipeline.core.schedule import ScheduleRecurrence, Schedule
# Submit the pipeline to be run
#pipeline_run1 = Experiment(ws, 'Churn_models').submit(pipeline1)
#pipeline_run1.wait_for_completion()
import azureml.core
from azureml.core import Workspace
from azureml.pipeline.core import Pipeline, PublishedPipeline
from azureml.core.experiment import Experiment
published_pipelines = PublishedPipeline.list(ws)
for published_pipeline in published_pipelines:
    print(f"{published_pipeline.name},'{published_pipeline.id}'")
from azureml.pipeline.core import PublishedPipeline, PipelineEndpoint
import datetime
# Retrieve a published pipeline
published_pipeline = PublishedPipeline.get(workspace=ws, id='3abaebbd-91b1-4138-967f-f3325d7553f1')
# Create a pipeline endpoint
#pipeline_endpoint = PipelineEndpoint.publish(
#workspace=ws,
#name="Churnendpoint6",
#pipeline=published_pipeline,
#description="churn"
#)
import azureml.core
from azureml.core import Workspace
from azureml.pipeline.core import Pipeline, PublishedPipeline
from azureml.core.experiment import Experiment
from azureml.pipeline.core import Schedule, ScheduleRecurrence, TimeZone
ws = Workspace.from_config()
experiments = Experiment.list(ws)
for experiment in experiments:
    print(experiment.name)
published_pipelines = PublishedPipeline.list(ws)
for published_pipeline in published_pipelines:
    print(f"{published_pipeline.name},'{published_pipeline.id}'")
experiment_name = "churnscheduling23"
pipeline_id = "3abaebbd-91b1-4138-967f-f3325d7553f1"
start_time = datetime.datetime(year=2025, month=8, day=13, hour=18, minute=33)
from azureml.pipeline.core.schedule import ScheduleRecurrence, Schedule
recurrence = ScheduleRecurrence(frequency="Day", interval=1, start_time=start_time,
                                time_zone=TimeZone.AUSEasternStandardTime)
recurring_schedule = Schedule.create(ws, name="MyRecurringSchedule3",
                                     description="Based on time",
                                     pipeline_id=pipeline_id,
                                     experiment_name=experiment_name,
                                     recurrence=recurrence)
I use the environment image mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04
with this conda yaml file:
channels:
- conda-forge
- defaults
dependencies:
- python=3.13.2
- pip=25.1
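For context, this is roughly how that base image and yaml come together into the "pythonfinal" environment referenced in the run configuration above. It is only a minimal sketch of my setup, not the exact code; the file name conda.yml is a placeholder here:
from azureml.core import Environment, Workspace
ws = Workspace.from_config()
# Build the environment from the conda yaml above (file name assumed)
curated_environment = Environment.from_conda_specification(name="pythonfinal", file_path="conda.yml")
# Use the Azure ML base image mentioned above
curated_environment.docker.base_image = "mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04"
curated_environment.register(workspace=ws)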
The logs I have attached show that h2o (a machine learning library) still cannot detect Java, despite it having just been installed in the code with jdk.install('17').
The logs also show that the code immediately after the installation, which is meant to locate the Java binaries and set the environment variables, fails as well because it cannot find the Java binaries either.
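My understanding (which may be wrong) is that jdk.install('17') from the install-jdk package unpacks the JDK into a directory under the home folder (~/.jdk by default) and returns that path, without adding anything to PATH, which would be consistent with shutil.which("java") finding nothing in the step. Below is a small sketch of what I mean, using the return value directly instead of searching PATH; none of this is from the real pipeline code and I have not verified it there:
import os
import shutil
import jdk
# install-jdk returns the directory the JDK was unpacked into (under ~/.jdk by default);
# it does not put the java binary on PATH, so shutil.which("java") can still return None.
install_dir = jdk.install('17')
print("install-jdk put the JDK in:", install_dir)
print("java on PATH:", shutil.which("java"))
# Setting JAVA_HOME/PATH from the returned directory is what I assume would be needed:
os.environ["JAVA_HOME"] = install_dir
os.environ["PATH"] = os.path.join(install_dir, "bin") + os.pathsep + os.environ["PATH"]
print("java on PATH now:", shutil.which("java"))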
What can be done to rectify this situation so that h2o can detect Java and h2o.init() succeeds?
I know it has something to do with the script being run as a pipeline, because when I use the same compute, also with Python 3.13.2 (the same as in the pipeline version), from a notebook, h2o detects Java when h2o.init() is run. In fact, I don't even have to install Java in that case, as it appears to come pre-installed with the environment variables already set up.
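For comparison, this is the kind of minimal diagnostic that could be dropped into both the notebook and the pipeline step to see how the two contexts differ; it is only a sketch, not code from the actual pipeline:
import os
import shutil
import subprocess
# Print where (if anywhere) java is visible in the current context
print("JAVA_HOME:", os.environ.get("JAVA_HOME"))
print("which java:", shutil.which("java"))
try:
    # java -version writes its output to stderr
    print(subprocess.run(["java", "-version"], capture_output=True, text=True).stderr)
except FileNotFoundError:
    print("java binary not found")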