From af3a6dacc236b7e15e3a8a6d264c2ca1fab85cab Mon Sep 17 00:00:00 2001 From: Tai Le Manh Date: Mon, 6 Jan 2025 23:49:12 +0700 Subject: [PATCH 1/3] Improve perfomance of 'reverse' function Signed-off-by: Tai Le Manh --- datafusion/functions/Cargo.toml | 5 ++ datafusion/functions/benches/reverse.rs | 90 +++++++++++++++++++++ datafusion/functions/src/unicode/reverse.rs | 22 +++-- 3 files changed, 110 insertions(+), 7 deletions(-) create mode 100644 datafusion/functions/benches/reverse.rs diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml index fd986c4be41cc..c8025fb2d895d 100644 --- a/datafusion/functions/Cargo.toml +++ b/datafusion/functions/Cargo.toml @@ -204,6 +204,11 @@ harness = false name = "strpos" required-features = ["unicode_expressions"] +[[bench]] +harness = false +name = "reverse" +required-features = ["unicode_expressions"] + [[bench]] harness = false name = "trunc" diff --git a/datafusion/functions/benches/reverse.rs b/datafusion/functions/benches/reverse.rs new file mode 100644 index 0000000000000..c7c1ef8a82209 --- /dev/null +++ b/datafusion/functions/benches/reverse.rs @@ -0,0 +1,90 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +extern crate criterion; + +use arrow::array::OffsetSizeTrait; +use arrow::util::bench_util::{ + create_string_array_with_len, create_string_view_array_with_len, +}; +use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use datafusion_expr::ColumnarValue; +use datafusion_functions::unicode; +use std::sync::Arc; + +fn create_args( + size: usize, + str_len: usize, + force_view_types: bool, +) -> Vec { + if force_view_types { + let string_array = + Arc::new(create_string_view_array_with_len(size, 0.1, str_len, false)); + + vec![ColumnarValue::Array(string_array)] + } else { + let string_array = + Arc::new(create_string_array_with_len::(size, 0.1, str_len)); + + vec![ColumnarValue::Array(string_array)] + } +} + +fn criterion_benchmark(c: &mut Criterion) { + let reverse = unicode::reverse(); + for size in [1024, 4096] { + let str_len = 8; + + let args = create_args::(size, str_len, true); + c.bench_function( + format!("reverse_string_view [size={}, str_len={}]", size, str_len).as_str(), + |b| { + b.iter(|| { + // TODO use invoke_with_args + black_box(reverse.invoke_batch(&args, str_len)) + }) + }, + ); + + let str_len = 32; + + let args = create_args::(size, str_len, true); + c.bench_function( + format!("reverse_string_view [size={}, str_len={}]", size, str_len).as_str(), + |b| { + b.iter(|| { + // TODO use invoke_with_args + black_box(reverse.invoke_batch(&args, str_len)) + }) + }, + ); + + let args = create_args::(size, str_len, false); + c.bench_function( + format!("reverse_string [size={}, str_len={}]", size, str_len).as_str(), + |b| { + b.iter(|| { + // TODO use invoke_with_args + black_box(reverse.invoke_batch(&args, str_len)) + }) + }, + ); + } +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/datafusion/functions/src/unicode/reverse.rs b/datafusion/functions/src/unicode/reverse.rs index 5ad347ed96c08..ec6f7a453e7a4 100644 --- a/datafusion/functions/src/unicode/reverse.rs +++ b/datafusion/functions/src/unicode/reverse.rs @@ -20,8 +20,7 @@ use std::sync::Arc; use crate::utils::{make_scalar_function, utf8_to_str_type}; use arrow::array::{ - Array, ArrayAccessor, ArrayIter, ArrayRef, AsArray, GenericStringArray, - OffsetSizeTrait, + Array, ArrayRef, AsArray, GenericStringBuilder, OffsetSizeTrait, StringArrayType, }; use arrow::datatypes::DataType; use datafusion_common::{exec_err, Result}; @@ -116,14 +115,23 @@ pub fn reverse(args: &[ArrayRef]) -> Result { } } -fn reverse_impl<'a, T: OffsetSizeTrait, V: ArrayAccessor>( +fn reverse_impl<'a, T: OffsetSizeTrait, V: StringArrayType<'a>>( string_array: V, ) -> Result { - let result = ArrayIter::new(string_array) - .map(|string| string.map(|string: &str| string.chars().rev().collect::())) - .collect::>(); + let mut builder: GenericStringBuilder = + GenericStringBuilder::with_capacity(string_array.len(), 1024); + + for string in string_array.iter() { + if let Some(s) = string { + let mut reversed = String::with_capacity(s.len()); + reversed.extend(s.chars().rev()); + builder.append_value(reversed); + } else { + builder.append_null(); + } + } - Ok(Arc::new(result) as ArrayRef) + Ok(Arc::new(builder.finish()) as ArrayRef) } #[cfg(test)] From f00aebb30a82d2a109db06dc03d13d9d5ad38d6e Mon Sep 17 00:00:00 2001 From: Tai Le Manh Date: Tue, 7 Jan 2025 14:27:28 +0700 Subject: [PATCH 2/3] Apply sugestion change --- datafusion/functions/src/unicode/reverse.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/datafusion/functions/src/unicode/reverse.rs b/datafusion/functions/src/unicode/reverse.rs index ec6f7a453e7a4..0ac86801f7d02 100644 --- a/datafusion/functions/src/unicode/reverse.rs +++ b/datafusion/functions/src/unicode/reverse.rs @@ -118,14 +118,14 @@ pub fn reverse(args: &[ArrayRef]) -> Result { fn reverse_impl<'a, T: OffsetSizeTrait, V: StringArrayType<'a>>( string_array: V, ) -> Result { - let mut builder: GenericStringBuilder = - GenericStringBuilder::with_capacity(string_array.len(), 1024); + let mut builder = GenericStringBuilder::::with_capacity(string_array.len(), 1024); + let mut reversed = String::new(); for string in string_array.iter() { if let Some(s) = string { - let mut reversed = String::with_capacity(s.len()); reversed.extend(s.chars().rev()); - builder.append_value(reversed); + builder.append_value(&reversed); + reversed.clear(); } else { builder.append_null(); } From 92e74a527f74a1ea7304114530fa709cc2f7d86a Mon Sep 17 00:00:00 2001 From: Tai Le Manh Date: Thu, 9 Jan 2025 00:30:13 +0700 Subject: [PATCH 3/3] Fix typo --- datafusion/functions/src/unicode/reverse.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/datafusion/functions/src/unicode/reverse.rs b/datafusion/functions/src/unicode/reverse.rs index 0ac86801f7d02..f07deda70e52e 100644 --- a/datafusion/functions/src/unicode/reverse.rs +++ b/datafusion/functions/src/unicode/reverse.rs @@ -104,8 +104,7 @@ impl ScalarUDFImpl for ReverseFunc { } } -/// Reverses the order of the characters in the string. -/// reverse('abcde') = 'edcba' +/// Reverses the order of the characters in the string `reverse('abcde') = 'edcba'`. /// The implementation uses UTF-8 code points as characters pub fn reverse(args: &[ArrayRef]) -> Result { if args[0].data_type() == &Utf8View {