duckdb · Mytherin · Apr 24, 2020 · Apr 18, 2020 · Apr 18, 2020 · Apr 18, 2020
diff --git a/.travis.yml b/.travis.yml
@@ -347,7 +347,7 @@ matrix:
       script:
         - python scripts/amalgamation.py > /dev/null
         - rsync -a -e "ssh $SSHFLAGS -p 2222" --exclude=.git --exclude=build --exclude=third_party/sqllogictest --exclude=third_party/imdb .  root@localhost:/duckdb
-        - $SCMD 'rm -rf /duckdb/build && mkdir -p /duckdb/build && cd /duckdb/build && export PATH=/opt/csw/bin/:$PATH CXX=g++ CC=gcc  && cmake -DCMAKE_AR=/opt/csw/bin/gar -DCMAKE_BUILD_TYPE=Debug -DAMALGAMATION_BUILD=1 .. && gmake -j2'
+        - travis_wait 30 $SCMD 'rm -rf /duckdb/build && mkdir -p /duckdb/build && cd /duckdb/build && export PATH=/opt/csw/bin/:$PATH CXX=g++ CC=gcc  && cmake -DCMAKE_AR=/opt/csw/bin/gar -DCMAKE_BUILD_TYPE=Debug -DAMALGAMATION_BUILD=1 .. && gmake -j2'
         - $SCMD /duckdb/build/test/unittest "~[copy]~[file_system]~[.]"
 
 

diff --git a/src/common/types.cpp b/src/common/types.cpp
@@ -159,13 +159,15 @@ void SQLType::Serialize(Serializer &serializer) {
 	serializer.Write(id);
 	serializer.Write(width);
 	serializer.Write(scale);
+	serializer.Write<CollationType>(collation);
 }
 
 SQLType SQLType::Deserialize(Deserializer &source) {
 	auto id = source.Read<SQLTypeId>();
 	auto width = source.Read<uint16_t>();
 	auto scale = source.Read<uint8_t>();
-	return SQLType(id, width, scale);
+	auto collation = source.Read<CollationType>(); // read last: Serialize writes id, width, scale, then collation
+	return SQLType(id, width, scale, collation);
 }
 
 string SQLTypeIdToString(SQLTypeId id) {
@@ -344,13 +346,44 @@ SQLType MaxSQLType(SQLType left, SQLType right) {
 		return right;
 	} else if (right.id < left.id) {
 		return left;
-	} else if (left.width > right.width) {
+	} else if (left.width > right.width || left.collation > right.collation) {
 		return left;
 	} else {
 		return right;
 	}
 }
 
+CollationType ParseCollation(string collation_argument, CollationType collation) { // combines one collation keyword with the collation built so far; throws ParserException on unknown or conflicting keywords
+	if (collation_argument == "nocase") { // keywords are matched exactly; the PRAGMA handler in this patch lowercases its input first
+		switch(collation) {
+		case CollationType::COLLATE_DEFAULT:
+			return CollationType::COLLATE_NOCASE;
+		case CollationType::COLLATE_NOACCENT: // noaccent already applied: return the combined value
+			return CollationType::COLLATE_NOCASE_NOACCENT;
+		default: // any other current collation (including one that is already nocase) is rejected
+			throw ParserException("Unexpected NOCASE collation!");
+		}
+	} else if (collation_argument == "noaccent") {
+		switch(collation) {
+		case CollationType::COLLATE_DEFAULT:
+			return CollationType::COLLATE_NOACCENT;
+		case CollationType::COLLATE_NOCASE: // nocase already applied: return the combined value
+			return CollationType::COLLATE_NOCASE_NOACCENT;
+		default: // any other current collation (including one that is already noaccent) is rejected
+			throw ParserException("Unexpected NOACCENT collation!");
+		}
+	} else if (collation_argument == "binary" || collation_argument == "c" || collation_argument == "posix") { // three accepted spellings, all handled identically
+		switch(collation) {
+		case CollationType::COLLATE_DEFAULT: // accepted only while the current collation is still COLLATE_DEFAULT
+			return CollationType::COLLATE_NONE;
+		default:
+			throw ParserException("Unexpected BINARY collation!");
+		}
+	} else {
+		throw ParserException("Unsupported collation type %s", collation_argument.c_str());
+	}
+}
+
 bool ApproxEqual(float ldecimal, float rdecimal) {
 	float epsilon = fabs(rdecimal) * 0.01;
 	return fabs(ldecimal - rdecimal) <= epsilon;

diff --git a/src/execution/operator/helper/physical_pragma.cpp b/src/execution/operator/helper/physical_pragma.cpp
@@ -68,6 +68,17 @@ void PhysicalPragma::GetChunkInternal(ClientContext &context, DataChunk &chunk,
 				    "Memory limit must be an assignment with a memory unit (e.g. PRAGMA memory_limit='1GB')");
 			}
 		}
+	} else if (keyword == "collation" || keyword == "default_collation") {
+		if (pragma.pragma_type != PragmaType::ASSIGNMENT) {
+			throw ParserException("Collation must be an assignment (e.g. PRAGMA default_collation=NOCASE)");
+		}
+		CollationType collation = CollationType::COLLATE_DEFAULT;
+		auto collation_param = StringUtil::Lower(pragma.parameters[0].CastAs(TypeId::VARCHAR).str_value);
+		auto splits = StringUtil::Split(collation_param, ".");
+		for(auto &collation_argument : splits) {
+			collation = ParseCollation(collation_argument, collation);
+		}
+		context.db.collation = collation;
 	} else {
 		throw ParserException("Unrecognized PRAGMA keyword: %s", keyword.c_str());
 	}

diff --git a/src/function/function.cpp b/src/function/function.cpp
@@ -52,6 +52,13 @@ void BuiltinFunctions::AddFunction(ScalarFunction function) {
 	catalog.CreateFunction(context, &info);
 }
 
+void BuiltinFunctions::AddFunction(vector<string> names, ScalarFunction function) { // registers the same scalar function once for every entry in names
+	for(auto &name: names) {
+		function.name = name; // function is a by-value parameter, so renaming it here does not affect the caller's object
+		AddFunction(function); // hands the renamed function to another AddFunction overload, once per name
+	}
+}
+
 void BuiltinFunctions::AddFunction(ScalarFunctionSet set) {
 	CreateScalarFunctionInfo info(set);
 	catalog.CreateFunction(context, &info);

diff --git a/src/function/scalar/date/current.cpp b/src/function/scalar/date/current.cpp
@@ -31,25 +31,15 @@ static void current_timestamp_function(DataChunk &input, ExpressionState &state,
 }
 
 void CurrentTimeFun::RegisterFunction(BuiltinFunctions &set) {
-	ScalarFunctionSet current_time("current_time");
-	current_time.AddFunction(ScalarFunction({}, SQLType::TIME, current_time_function));
-	set.AddFunction(current_time);
+	set.AddFunction(ScalarFunction("current_time", {}, SQLType::TIME, current_time_function));
 }
 
 void CurrentDateFun::RegisterFunction(BuiltinFunctions &set) {
-	ScalarFunctionSet current_date("current_date");
-	current_date.AddFunction(ScalarFunction({}, SQLType::DATE, current_date_function));
-	set.AddFunction(current_date);
+	set.AddFunction(ScalarFunction("current_date", {}, SQLType::DATE, current_date_function));
 }
 
 void CurrentTimestampFun::RegisterFunction(BuiltinFunctions &set) {
-	ScalarFunctionSet current_timestamp("current_timestamp");
-	current_timestamp.AddFunction(ScalarFunction({}, SQLType::TIMESTAMP, current_timestamp_function));
-	set.AddFunction(current_timestamp);
-
-	ScalarFunctionSet now("now");
-	now.AddFunction(ScalarFunction({}, SQLType::TIMESTAMP, current_timestamp_function));
-	set.AddFunction(now);
+	set.AddFunction({"now", "current_timestamp"}, ScalarFunction({}, SQLType::TIMESTAMP, current_timestamp_function));
 }
 
 } // namespace duckdb
diff --git a/src/function/scalar/math/numeric.cpp b/src/function/scalar/math/numeric.cpp
@@ -258,12 +258,7 @@ struct Log10Operator {
 };
 
 void Log10Fun::RegisterFunction(BuiltinFunctions &set) {
-	ScalarFunction log_function("log10", {SQLType::DOUBLE}, SQLType::DOUBLE,
-	                            UnaryDoubleFunctionWrapper<double, Log10Operator>);
-	set.AddFunction(log_function);
-	// "log" is an alias for "log10"
-	log_function.name = "log";
-	set.AddFunction(log_function);
+	set.AddFunction({"log10", "log"}, ScalarFunction({SQLType::DOUBLE}, SQLType::DOUBLE, UnaryDoubleFunctionWrapper<double, Log10Operator>));
 }
 
 //===--------------------------------------------------------------------===//

diff --git a/src/function/scalar/string/CMakeLists.txt b/src/function/scalar/string/CMakeLists.txt
@@ -11,6 +11,7 @@ add_library_unity(
   substring.cpp
   instr.cpp
   prefix.cpp
+  strip_accents.cpp
   suffix.cpp
   contains.cpp)
 set(ALL_OBJECT_FILES

diff --git a/src/function/scalar/string/caseconvert.cpp b/src/function/scalar/string/caseconvert.cpp
@@ -3,63 +3,92 @@
 #include "duckdb/common/exception.hpp"
 #include "duckdb/common/vector_operations/vector_operations.hpp"
 #include "duckdb/common/vector_operations/unary_executor.hpp"
+#include "utf8proc.hpp"
 
 #include <string.h>
 
 using namespace std;
 
 namespace duckdb {
 
-// TODO: this does not handle UTF characters yet.
-template <class OP> static void strcase(const char *input_data, idx_t input_length, char *output) {
-	for (idx_t i = 0; i < input_length; i++) {
-		output[i] = OP::Operation(input_data[i]);
+template <bool IS_UPPER>
+static string_t strcase_unicode(Vector &result, const char *input_data, idx_t input_length) {
+	// converts the input to upper case (IS_UPPER) or lower case into a new string created via StringVector::EmptyString(result, ...)
+	// pass 1 sizes the output: a converted codepoint may need a different number of bytes than the one it replaces
+	idx_t output_length = 0;
+	for (idx_t i = 0; i < input_length;) {
+		if (input_data[i] & 0x80) {
+			// non-ascii: sz = bytes consumed from the input, new_sz = bytes needed by the converted codepoint
+			int sz = 0, new_sz = 0;
+			int codepoint = utf8proc_codepoint(input_data + i, sz);
+			int converted_codepoint = IS_UPPER ? utf8proc_toupper(codepoint) : utf8proc_tolower(codepoint);
+			new_sz = utf8proc_codepoint_length(converted_codepoint);
+			if (sz <= 0 || new_sz < 0) {
+				throw InternalException("Invalid UTF8 encountered!");
+			}
+			output_length += new_sz;
+			i += sz;
+		} else {
+			// ascii
+			output_length++;
+			i++;
+		}
 	}
-	output[input_length] = '\0';
+	auto result_str = StringVector::EmptyString(result, output_length);
+	auto result_data = result_str.GetData();
+	// pass 2: write the converted bytes; it must walk the input exactly like pass 1 so the sizes agree
+	for (idx_t i = 0; i < input_length;) {
+		if (input_data[i] & 0x80) {
+			// non-ascii: advance the input by the bytes read (sz) and the output by the bytes written (new_sz)
+			int sz = 0, new_sz = 0;
+			int codepoint = utf8proc_codepoint(input_data + i, sz);
+			int converted_codepoint = IS_UPPER ? utf8proc_toupper(codepoint) : utf8proc_tolower(codepoint);
+			if (sz <= 0 || !utf8proc_codepoint_to_utf8(converted_codepoint, new_sz, result_data)) {
+				throw InternalException("Invalid UTF8 encountered!");
+			}
+			result_data += new_sz;
+			i += sz;
+		} else {
+			// ascii
+			*result_data = IS_UPPER ? toupper(input_data[i]) : tolower(input_data[i]);
+			result_data++;
+			i++;
+		}
+	}
+	result_str.Finalize();
+	return result_str;
 }
 
-template <class OP> static void caseconvert_function(Vector &input, Vector &result, idx_t count) {
+template <bool IS_UPPER> static void caseconvert_function(Vector &input, Vector &result, idx_t count) {
 	assert(input.type == TypeId::VARCHAR);
 
 	UnaryExecutor::Execute<string_t, string_t, true>(input, result, count, [&](string_t input) {
 		auto input_data = input.GetData();
 		auto input_length = input.GetSize();
-
-		auto target = StringVector::EmptyString(result, input_length);
-		strcase<OP>(input_data, input_length, target.GetData());
-		target.Finalize();
-		return target;
+		return strcase_unicode<IS_UPPER>(result, input_data, input_length);
 	});
 }
 
-struct StringToUpper {
-	static char Operation(char input) {
-		return toupper(input);
-	}
-};
-
-struct StringToLower {
-	static char Operation(char input) {
-		return tolower(input);
-	}
-};
-
 static void caseconvert_upper_function(DataChunk &args, ExpressionState &state, Vector &result) {
 	assert(args.column_count() == 1);
-	caseconvert_function<StringToUpper>(args.data[0], result, args.size());
+	caseconvert_function<true>(args.data[0], result, args.size());
 }
 
 static void caseconvert_lower_function(DataChunk &args, ExpressionState &state, Vector &result) {
 	assert(args.column_count() == 1);
-	caseconvert_function<StringToLower>(args.data[0], result, args.size());
+	caseconvert_function<false>(args.data[0], result, args.size());
+}
+
+ScalarFunction LowerFun::GetFunction() {
+	return ScalarFunction({SQLType::VARCHAR}, SQLType::VARCHAR, caseconvert_lower_function);
 }
 
 void LowerFun::RegisterFunction(BuiltinFunctions &set) {
-	set.AddFunction(ScalarFunction("lower", {SQLType::VARCHAR}, SQLType::VARCHAR, caseconvert_lower_function));
+	set.AddFunction({"lower", "lcase"}, LowerFun::GetFunction());
 }
 
 void UpperFun::RegisterFunction(BuiltinFunctions &set) {
-	set.AddFunction(ScalarFunction("upper", {SQLType::VARCHAR}, SQLType::VARCHAR, caseconvert_upper_function));
+	set.AddFunction({"upper", "ucase"}, ScalarFunction({SQLType::VARCHAR}, SQLType::VARCHAR, caseconvert_upper_function));
 }
 
 } // namespace duckdb
diff --git a/src/function/scalar/string/length.cpp b/src/function/scalar/string/length.cpp
@@ -2,26 +2,44 @@
 
 #include "duckdb/common/exception.hpp"
 #include "duckdb/common/vector_operations/vector_operations.hpp"
+#include "utf8proc.hpp"
 
 using namespace std;
 
 namespace duckdb {
 
+// length returns the size in characters: the byte count for pure-ASCII input, otherwise one per grapheme callback invocation
 struct StringLengthOperator {
 	template <class TA, class TR> static inline TR Operation(TA input) {
-		int64_t length = 0;
 		auto input_data = input.GetData();
 		auto input_length = input.GetSize();
 		for (idx_t i = 0; i < input_length; i++) {
-			length += (input_data[i] & 0xC0) != 0x80;
+			if (input_data[i] & 0x80) {
+				int64_t length = 0;
+				// non-ascii byte found: recount from the start, the grapheme iterator is given the whole string
+				utf8proc_grapheme_callback(input_data, input_length, [&](size_t start, size_t end) {
+					length++;
+					return true;
+				});
+				return length;
+			}
 		}
-		return length;
+		return input_length; // no byte had its high bit set: pure ASCII, so bytes == characters
+	}
+};
+
+// strlen returns the size in bytes
+struct StrLenOperator {
+	template <class TA, class TR> static inline TR Operation(TA input) {
+		return input.GetSize();
 	}
 };
 
 void LengthFun::RegisterFunction(BuiltinFunctions &set) {
-	set.AddFunction(ScalarFunction("length", {SQLType::VARCHAR}, SQLType::BIGINT,
+	set.AddFunction({"length", "len"}, ScalarFunction({SQLType::VARCHAR}, SQLType::BIGINT,
 	                               ScalarFunction::UnaryFunction<string_t, int64_t, StringLengthOperator, true>));
+	set.AddFunction(ScalarFunction("strlen", {SQLType::VARCHAR}, SQLType::BIGINT,
+	                               ScalarFunction::UnaryFunction<string_t, int64_t, StrLenOperator, true>));
 }
 
 } // namespace duckdb
diff --git a/src/function/scalar/string/prefix.cpp b/src/function/scalar/string/prefix.cpp
@@ -46,17 +46,14 @@ static bool prefix(const string_t &str, const string_t &pattern) {
 			}
 		}
 		// compare the rest of the prefix
-		bool equal;
-		uint32_t num_char_equals = string_t::PREFIX_LENGTH;
 		const char *str_data = str.GetData();
 		const char *patt_data = pattern.GetData();
-
 		for (idx_t i = string_t::PREFIX_LENGTH; i < patt_length; ++i) {
-			equal = (str_data[i] == patt_data[i]); // removed branch
-			num_char_equals += equal;
+			if (str_data[i] != patt_data[i]) {
+				return false;
+			}
 		}
-
-		return (num_char_equals == patt_length);
+		return true;
 	}
 }