diff --git a/onnxruntime/core/providers/acl/math/gemm.h b/onnxruntime/core/providers/acl/math/gemm.h
index 2dd9149..085aa75 100644
--- a/onnxruntime/core/providers/acl/math/gemm.h
+++ b/onnxruntime/core/providers/acl/math/gemm.h
@@ -7,6 +7,7 @@
 #include "core/framework/op_kernel.h"
 #include "core/util/math.h"
 #include "core/util/math_cpuonly.h"
+#include "core/providers/cpu/math/gemm_helper.h"
 #include "core/providers/cpu/math/gemm.h"
 #include "core/providers/acl/acl_execution_provider.h"
 
diff --git a/onnxruntime/core/providers/acl/nn/conv.cc b/onnxruntime/core/providers/acl/nn/conv.cc
index 1aed263..4e936a2 100644
--- a/onnxruntime/core/providers/acl/nn/conv.cc
+++ b/onnxruntime/core/providers/acl/nn/conv.cc
@@ -216,7 +216,7 @@ Status Conv<T>::Compute(OpKernelContext* context) const {
                                                                                       1 /* depth multiplier */,
                                                                                       arm_compute::Size2D(aclDilation0, dilations[0]));
 #elif defined(ACL_2002)
-      bool optimizable = bool(arm_compute::NEDepthwiseConvolutionLayerOptimized::validate(tconv.in->info(),
+      bool optimizable = bool(arm_compute::NEDepthwiseConvolutionLayer::validate(tconv.in->info(),
                                                                            tconv.k->info(),
                                                                            (B != nullptr) ? tconv.b->info() : nullptr,
                                                                            tconv.out->info(),
diff --git a/onnxruntime/core/providers/acl/nn/pool.cc b/onnxruntime/core/providers/acl/nn/pool.cc
index 69d272c..a0f3191 100644
--- a/onnxruntime/core/providers/acl/nn/pool.cc
+++ b/onnxruntime/core/providers/acl/nn/pool.cc
@@ -60,8 +60,9 @@ ACLNEPool PoolOperation(onnxruntime::OpKernelContext* context,
     tpool.in->allocator()->init(arm_compute::TensorInfo(ACLTensorShape(X->Shape(), PREF_DIM), arm_compute::Format::F32));
     tpool.out->allocator()->init(arm_compute::TensorInfo(ACLTensorShape(Y->Shape(), PREF_DIM), arm_compute::Format::F32));
 
+    const arm_compute::DataLayout data_layout = tpool.in->info()->data_layout();
     if (pool_attrs.global_pooling) {
-      layer->configure(tpool.in.get(), tpool.out.get(), arm_compute::PoolingLayerInfo(pool_type));
+      layer->configure(tpool.in.get(), tpool.out.get(), arm_compute::PoolingLayerInfo(pool_type, data_layout));
     } else {
       std::vector<int64_t> aclStrides(2);
       aclStrides[0] = (strides.size() == 2) ? strides[1] : 1;
@@ -104,7 +105,7 @@ ACLNEPool PoolOperation(onnxruntime::OpKernelContext* context,
       LOGS_DEFAULT(VERBOSE) << "strides: {" << aclStrides[0] << "," << aclStrides[1] << "}";
       LOGS_DEFAULT(VERBOSE) << "excludePadding: " << excludePadding;
 
-      arm_compute::PoolingLayerInfo pool_info(pool_type, aclSize, aclPadStride, excludePadding);
+      arm_compute::PoolingLayerInfo pool_info(pool_type, aclSize, data_layout, aclPadStride, excludePadding);
       layer->configure(tpool.in.get(), tpool.out.get(), pool_info);
     }
 
diff --git a/onnxruntime/core/providers/acl/tensor/concat.cc b/onnxruntime/core/providers/acl/tensor/concat.cc
index a748997..e4b74dd 100644
--- a/onnxruntime/core/providers/acl/tensor/concat.cc
+++ b/onnxruntime/core/providers/acl/tensor/concat.cc
@@ -65,6 +65,7 @@ Status Concat<T>::Compute(OpKernelContext* ctx) const {
 
   arm_compute::Tensor output;
   std::vector<arm_compute::ITensor*> inputs_vector;
+  std::vector<const arm_compute::ITensor*> const_inputs_vector;
   for (int i = 0; i < input_count; i++) {
     arm_compute::Tensor* input = new arm_compute::Tensor();
     auto X = input_tensors[i];
@@ -72,10 +73,11 @@ Status Concat<T>::Compute(OpKernelContext* ctx) const {
     input->allocator()->init(arm_compute::TensorInfo(ACLTensorShape(X->Shape(), PREF_DIM), arm_compute::Format::F32));
 
     inputs_vector.push_back(input);
+    const_inputs_vector.push_back(input);
   }
 
   arm_compute::NEConcatenateLayer layer;
-  layer.configure(inputs_vector, &output, 3 - axis_);
+  layer.configure(const_inputs_vector, &output, 3 - axis_);
 
   LOGS_DEFAULT(VERBOSE) << "axis: " << axis_;
   LOGS_DEFAULT(VERBOSE) << std::endl;
diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py
index 6168a96..bd39a39 100644
--- a/tools/ci_build/build.py
+++ b/tools/ci_build/build.py
@@ -410,7 +410,7 @@ def parse_arguments():
         help="Enable Link Time Optimization")
     parser.add_argument(
         "--use_acl", nargs="?", const="ACL_1905",
-        choices=["ACL_1902", "ACL_1905", "ACL_1908", "ACL_2002"],
+        choices=["ACL_1902", "ACL_1905", "ACL_1908", "ACL_2002", "ACL_2008"],
         help="Build with ACL for ARM architectures.")
     parser.add_argument(
         "--acl_home", help="Path to ACL home dir")