Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
2927f78
Reverse now reverses grapheme clusters in unicode text (fixes #567)
Mytherin Apr 18, 2020
265e14a
Use grapheme clusters in linenoise as well
Mytherin Apr 18, 2020
f229371
Also consider render width in linenoise
Mytherin Apr 18, 2020
fdc073e
Add utf8proc_grapheme_callback method, that cycles over all grapheme …
Mytherin Apr 18, 2020
ca6ec82
Correct substring with grapheme clusters
Mytherin Apr 18, 2020
027c0ea
Fix Ctrl+T with unicode characters in linenoise
Mytherin Apr 18, 2020
a661092
Unicode upper/lower support
Mytherin Apr 18, 2020
c868205
Add missing file
Mytherin Apr 18, 2020
dee9413
For substring ascii test check one character past the offset+length t…
Mytherin Apr 18, 2020
ab6a58a
Merge branch 'master' into utf8fix
Mytherin Apr 19, 2020
b482424
Minor upper/lower rework: first figure out length of result and then …
Mytherin Apr 19, 2020
ed4079d
Add travis_wait 30 to OpenBSD/Solaris builds
Mytherin Apr 19, 2020
f5d8f67
Correctly support non-ascii prompt again (🦆)
Mytherin Apr 19, 2020
55c5645
Fix for scrolling behavior in linenoise with UTF characters
Mytherin Apr 19, 2020
e55a796
Always refresh linenoise display after adding a new character to corr…
Mytherin Apr 19, 2020
a4b594a
Add unicode NFC normalization test to CSV reader test suite
Mytherin Apr 19, 2020
c2c6960
Add strip_accents function
Mytherin Apr 20, 2020
9a8e34a
Add support for COLLATE NOACCENT and COLLATE NOCASE in table columns
Mytherin Apr 20, 2020
9ef0294
Merge branch 'master' into utf8fix
Mytherin Apr 23, 2020
bf307a0
Add support for default collation type. The collation can be set on d…
Mytherin Apr 23, 2020
eada68b
Add support for COLLATE in expressions
Mytherin Apr 23, 2020
8cd206e
More easy aliasing of scalar functions
Mytherin Apr 23, 2020
2ea4ad5
Fix typo
Mytherin Apr 23, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -347,7 +347,7 @@ matrix:
script:
- python scripts/amalgamation.py > /dev/null
- rsync -a -e "ssh $SSHFLAGS -p 2222" --exclude=.git --exclude=build --exclude=third_party/sqllogictest --exclude=third_party/imdb . root@localhost:/duckdb
- $SCMD 'rm -rf /duckdb/build && mkdir -p /duckdb/build && cd /duckdb/build && export PATH=/opt/csw/bin/:$PATH CXX=g++ CC=gcc && cmake -DCMAKE_AR=/opt/csw/bin/gar -DCMAKE_BUILD_TYPE=Debug -DAMALGAMATION_BUILD=1 .. && gmake -j2'
- travis_wait 30 $SCMD 'rm -rf /duckdb/build && mkdir -p /duckdb/build && cd /duckdb/build && export PATH=/opt/csw/bin/:$PATH CXX=g++ CC=gcc && cmake -DCMAKE_AR=/opt/csw/bin/gar -DCMAKE_BUILD_TYPE=Debug -DAMALGAMATION_BUILD=1 .. && gmake -j2'
- $SCMD /duckdb/build/test/unittest "~[copy]~[file_system]~[.]"


Expand Down
37 changes: 35 additions & 2 deletions src/common/types.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -159,13 +159,15 @@ void SQLType::Serialize(Serializer &serializer) {
serializer.Write(id);
serializer.Write(width);
serializer.Write(scale);
serializer.Write<CollationType>(collation);
}

SQLType SQLType::Deserialize(Deserializer &source) {
auto id = source.Read<SQLTypeId>();
auto width = source.Read<uint16_t>();
auto scale = source.Read<uint8_t>();
return SQLType(id, width, scale);
auto collation = source.Read<CollationType>();
return SQLType(id, width, scale, collation);
}

string SQLTypeIdToString(SQLTypeId id) {
Expand Down Expand Up @@ -344,13 +346,44 @@ SQLType MaxSQLType(SQLType left, SQLType right) {
return right;
} else if (right.id < left.id) {
return left;
} else if (left.width > right.width) {
} else if (left.width > right.width || left.collation > right.collation) {
return left;
} else {
return right;
}
}

// Folds a single collation specifier into an already-accumulated collation.
//
// collation_argument: one specifier; it is compared verbatim against the lowercase literals
//                     "nocase", "noaccent", "binary", "c" and "posix"
// collation:          the collation built up so far
//
// Returns the combined collation:
//   "nocase"                 : DEFAULT -> NOCASE,   NOACCENT -> NOCASE_NOACCENT
//   "noaccent"               : DEFAULT -> NOACCENT, NOCASE   -> NOCASE_NOACCENT
//   "binary" / "c" / "posix" : DEFAULT -> NONE
// Throws a ParserException for any other (specifier, collation) pairing and for unknown specifiers.
CollationType ParseCollation(string collation_argument, CollationType collation) {
	if (collation_argument == "nocase") {
		if (collation == CollationType::COLLATE_DEFAULT) {
			return CollationType::COLLATE_NOCASE;
		}
		if (collation == CollationType::COLLATE_NOACCENT) {
			return CollationType::COLLATE_NOCASE_NOACCENT;
		}
		// NOCASE was already applied, or the collation was pinned by a binary specifier
		throw ParserException("Unexpected NOCASE collation!");
	}
	if (collation_argument == "noaccent") {
		if (collation == CollationType::COLLATE_DEFAULT) {
			return CollationType::COLLATE_NOACCENT;
		}
		if (collation == CollationType::COLLATE_NOCASE) {
			return CollationType::COLLATE_NOCASE_NOACCENT;
		}
		// NOACCENT was already applied, or the collation was pinned by a binary specifier
		throw ParserException("Unexpected NOACCENT collation!");
	}
	if (collation_argument == "binary" || collation_argument == "c" || collation_argument == "posix") {
		// the binary specifiers are only accepted when nothing else has been applied yet
		if (collation == CollationType::COLLATE_DEFAULT) {
			return CollationType::COLLATE_NONE;
		}
		throw ParserException("Unexpected BINARY collation!");
	}
	throw ParserException("Unsupported collation type %s", collation_argument.c_str());
}

bool ApproxEqual(float ldecimal, float rdecimal) {
float epsilon = fabs(rdecimal) * 0.01;
return fabs(ldecimal - rdecimal) <= epsilon;
Expand Down
11 changes: 11 additions & 0 deletions src/execution/operator/helper/physical_pragma.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,17 @@ void PhysicalPragma::GetChunkInternal(ClientContext &context, DataChunk &chunk,
"Memory limit must be an assignment with a memory unit (e.g. PRAGMA memory_limit='1GB')");
}
}
} else if (keyword == "collation" || keyword == "default_collation") {
if (pragma.pragma_type != PragmaType::ASSIGNMENT) {
throw ParserException("Collation must be an assignment (e.g. PRAGMA default_collation=NOCASE)");
}
CollationType collation = CollationType::COLLATE_DEFAULT;
auto collation_param = StringUtil::Lower(pragma.parameters[0].CastAs(TypeId::VARCHAR).str_value);
auto splits = StringUtil::Split(collation_param, ".");
for(auto &collation_argument : splits) {
collation = ParseCollation(collation_argument, collation);
}
context.db.collation = collation;
} else {
throw ParserException("Unrecognized PRAGMA keyword: %s", keyword.c_str());
}
Expand Down
7 changes: 7 additions & 0 deletions src/function/function.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,13 @@ void BuiltinFunctions::AddFunction(ScalarFunction function) {
catalog.CreateFunction(context, &info);
}

// Registers the same scalar function once per entry in `names`, so that every entry becomes a
// callable name for it. `function` is taken by value: its name field is overwritten with each
// entry in turn before it is handed to the single-function AddFunction overload.
void BuiltinFunctions::AddFunction(vector<string> names, ScalarFunction function) {
	for (auto alias = names.begin(); alias != names.end(); ++alias) {
		function.name = *alias;
		AddFunction(function);
	}
}

void BuiltinFunctions::AddFunction(ScalarFunctionSet set) {
CreateScalarFunctionInfo info(set);
catalog.CreateFunction(context, &info);
Expand Down
16 changes: 3 additions & 13 deletions src/function/scalar/date/current.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,25 +31,15 @@ static void current_timestamp_function(DataChunk &input, ExpressionState &state,
}

void CurrentTimeFun::RegisterFunction(BuiltinFunctions &set) {
ScalarFunctionSet current_time("current_time");
current_time.AddFunction(ScalarFunction({}, SQLType::TIME, current_time_function));
set.AddFunction(current_time);
set.AddFunction(ScalarFunction("current_time", {}, SQLType::TIME, current_time_function));
}

void CurrentDateFun::RegisterFunction(BuiltinFunctions &set) {
ScalarFunctionSet current_date("current_date");
current_date.AddFunction(ScalarFunction({}, SQLType::DATE, current_date_function));
set.AddFunction(current_date);
set.AddFunction(ScalarFunction("current_date", {}, SQLType::DATE, current_date_function));
}

void CurrentTimestampFun::RegisterFunction(BuiltinFunctions &set) {
ScalarFunctionSet current_timestamp("current_timestamp");
current_timestamp.AddFunction(ScalarFunction({}, SQLType::TIMESTAMP, current_timestamp_function));
set.AddFunction(current_timestamp);

ScalarFunctionSet now("now");
now.AddFunction(ScalarFunction({}, SQLType::TIMESTAMP, current_timestamp_function));
set.AddFunction(now);
set.AddFunction({"now", "current_timestamp"}, ScalarFunction({}, SQLType::TIMESTAMP, current_timestamp_function));
}

} // namespace duckdb
7 changes: 1 addition & 6 deletions src/function/scalar/math/numeric.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -258,12 +258,7 @@ struct Log10Operator {
};

void Log10Fun::RegisterFunction(BuiltinFunctions &set) {
ScalarFunction log_function("log10", {SQLType::DOUBLE}, SQLType::DOUBLE,
UnaryDoubleFunctionWrapper<double, Log10Operator>);
set.AddFunction(log_function);
// "log" is an alias for "log10"
log_function.name = "log";
set.AddFunction(log_function);
set.AddFunction({"log10", "log"}, ScalarFunction({SQLType::DOUBLE}, SQLType::DOUBLE, UnaryDoubleFunctionWrapper<double, Log10Operator>));
}

//===--------------------------------------------------------------------===//
Expand Down
1 change: 1 addition & 0 deletions src/function/scalar/string/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ add_library_unity(
substring.cpp
instr.cpp
prefix.cpp
strip_accents.cpp
suffix.cpp
contains.cpp)
set(ALL_OBJECT_FILES
Expand Down
83 changes: 56 additions & 27 deletions src/function/scalar/string/caseconvert.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,63 +3,92 @@
#include "duckdb/common/exception.hpp"
#include "duckdb/common/vector_operations/vector_operations.hpp"
#include "duckdb/common/vector_operations/unary_executor.hpp"
#include "utf8proc.hpp"

#include <string.h>

using namespace std;

namespace duckdb {

// TODO: this does not handle UTF characters yet.
template <class OP> static void strcase(const char *input_data, idx_t input_length, char *output) {
for (idx_t i = 0; i < input_length; i++) {
output[i] = OP::Operation(input_data[i]);
// Converts a UTF-8 encoded string to upper case (IS_UPPER = true) or lower case (IS_UPPER = false)
// and returns the converted string, which is created through StringVector::EmptyString on `result`.
//
// result:       vector that the output string is created in
// input_data:   pointer to the input bytes
// input_length: number of input bytes
//
// Works in two passes: the first pass computes the exact byte length of the output so the result
// string can be created at its final size, the second pass writes the converted bytes.
// Bytes without the high bit set take an ASCII fast path (toupper/tolower); all other characters
// are decoded to a codepoint, converted with utf8proc_toupper/utf8proc_tolower and re-encoded.
//
// The encoded size of a codepoint may change when its case changes (e.g. U+0131 is two bytes but
// its upper case 'I' is one byte). The input cursor must therefore advance by the size of the
// *input* character while the output length/cursor advances by the size of the *converted*
// character; the two sizes are tracked separately in both passes.
//
// Throws InternalException if a character cannot be decoded or re-encoded.
template <bool IS_UPPER>
static string_t strcase_unicode(Vector &result, const char *input_data, idx_t input_length) {
	// pass 1: figure out the output length
	// for ascii-only input the output length is equal to the input length
	idx_t output_length = 0;
	for (idx_t i = 0; i < input_length;) {
		if (input_data[i] & 0x80) {
			// non-ascii character
			int input_sz = 0;
			int codepoint = utf8proc_codepoint(input_data + i, input_sz);
			int converted_codepoint = IS_UPPER ? utf8proc_toupper(codepoint) : utf8proc_tolower(codepoint);
			int output_sz = utf8proc_codepoint_length(converted_codepoint);
			// input_sz <= 0 would mean the cursor cannot advance: treat it as invalid input as well
			if (input_sz <= 0 || output_sz < 0) {
				throw InternalException("Invalid UTF8 encountered!");
			}
			output_length += output_sz;
			i += input_sz;
		} else {
			// ascii
			output_length++;
			i++;
		}
	}
	auto result_str = StringVector::EmptyString(result, output_length);
	auto result_data = result_str.GetData();

	// pass 2: write the converted characters
	for (idx_t i = 0; i < input_length;) {
		if (input_data[i] & 0x80) {
			// non-ascii character
			int input_sz = 0;
			int output_sz = 0;
			int codepoint = utf8proc_codepoint(input_data + i, input_sz);
			int converted_codepoint = IS_UPPER ? utf8proc_toupper(codepoint) : utf8proc_tolower(codepoint);
			if (!utf8proc_codepoint_to_utf8(converted_codepoint, output_sz, result_data)) {
				throw InternalException("Invalid UTF8 encountered!");
			}
			// advance the output by what was written and the input by what was consumed
			result_data += output_sz;
			i += input_sz;
		} else {
			// ascii
			*result_data = IS_UPPER ? toupper(input_data[i]) : tolower(input_data[i]);
			result_data++;
			i++;
		}
	}
	result_str.Finalize();
	return result_str;
}

template <class OP> static void caseconvert_function(Vector &input, Vector &result, idx_t count) {
template <bool IS_UPPER> static void caseconvert_function(Vector &input, Vector &result, idx_t count) {
assert(input.type == TypeId::VARCHAR);

UnaryExecutor::Execute<string_t, string_t, true>(input, result, count, [&](string_t input) {
auto input_data = input.GetData();
auto input_length = input.GetSize();

auto target = StringVector::EmptyString(result, input_length);
strcase<OP>(input_data, input_length, target.GetData());
target.Finalize();
return target;
return strcase_unicode<IS_UPPER>(result, input_data, input_length);
});
}

struct StringToUpper {
static char Operation(char input) {
return toupper(input);
}
};

struct StringToLower {
static char Operation(char input) {
return tolower(input);
}
};

static void caseconvert_upper_function(DataChunk &args, ExpressionState &state, Vector &result) {
assert(args.column_count() == 1);
caseconvert_function<StringToUpper>(args.data[0], result, args.size());
caseconvert_function<true>(args.data[0], result, args.size());
}

static void caseconvert_lower_function(DataChunk &args, ExpressionState &state, Vector &result) {
assert(args.column_count() == 1);
caseconvert_function<StringToLower>(args.data[0], result, args.size());
caseconvert_function<false>(args.data[0], result, args.size());
}

// Builds the scalar function descriptor for lower-casing: one VARCHAR argument, VARCHAR result,
// executed by caseconvert_lower_function. No function name is passed to the constructor here.
ScalarFunction LowerFun::GetFunction() {
	ScalarFunction lower_function({SQLType::VARCHAR}, SQLType::VARCHAR, caseconvert_lower_function);
	return lower_function;
}

void LowerFun::RegisterFunction(BuiltinFunctions &set) {
set.AddFunction(ScalarFunction("lower", {SQLType::VARCHAR}, SQLType::VARCHAR, caseconvert_lower_function));
set.AddFunction({"lower", "lcase"}, LowerFun::GetFunction());
}

void UpperFun::RegisterFunction(BuiltinFunctions &set) {
set.AddFunction(ScalarFunction("upper", {SQLType::VARCHAR}, SQLType::VARCHAR, caseconvert_upper_function));
set.AddFunction({"upper", "ucase"}, ScalarFunction({SQLType::VARCHAR}, SQLType::VARCHAR, caseconvert_upper_function));
}

} // namespace duckdb
26 changes: 22 additions & 4 deletions src/function/scalar/string/length.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,26 +2,44 @@

#include "duckdb/common/exception.hpp"
#include "duckdb/common/vector_operations/vector_operations.hpp"
#include "utf8proc.hpp"

using namespace std;

namespace duckdb {

// length returns the size in characters
struct StringLengthOperator {
template <class TA, class TR> static inline TR Operation(TA input) {
int64_t length = 0;
auto input_data = input.GetData();
auto input_length = input.GetSize();
for (idx_t i = 0; i < input_length; i++) {
length += (input_data[i] & 0xC0) != 0x80;
if (input_data[i] & 0x80) {
int64_t length = 0;
// non-ascii character: use grapheme iterator on remainder of string
utf8proc_grapheme_callback(input_data, input_length, [&](size_t start, size_t end) {
length++;
return true;
});
return length;
}
}
return length;
return input_length;
}
};

// Operator behind strlen: yields the byte size of its input as reported by GetSize(),
// converted to the requested result type TR
struct StrLenOperator {
	template <class TA, class TR>
	static inline TR Operation(TA input) {
		TR size_in_bytes = input.GetSize();
		return size_in_bytes;
	}
};

void LengthFun::RegisterFunction(BuiltinFunctions &set) {
set.AddFunction(ScalarFunction("length", {SQLType::VARCHAR}, SQLType::BIGINT,
set.AddFunction({"length", "len"}, ScalarFunction({SQLType::VARCHAR}, SQLType::BIGINT,
ScalarFunction::UnaryFunction<string_t, int64_t, StringLengthOperator, true>));
set.AddFunction(ScalarFunction("strlen", {SQLType::VARCHAR}, SQLType::BIGINT,
ScalarFunction::UnaryFunction<string_t, int64_t, StrLenOperator, true>));
}

} // namespace duckdb
11 changes: 4 additions & 7 deletions src/function/scalar/string/prefix.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,17 +46,14 @@ static bool prefix(const string_t &str, const string_t &pattern) {
}
}
// compare the rest of the prefix
bool equal;
uint32_t num_char_equals = string_t::PREFIX_LENGTH;
const char *str_data = str.GetData();
const char *patt_data = pattern.GetData();

for (idx_t i = string_t::PREFIX_LENGTH; i < patt_length; ++i) {
equal = (str_data[i] == patt_data[i]); // removed branch
num_char_equals += equal;
if (str_data[i] != patt_data[i]) {
return false;
}
}

return (num_char_equals == patt_length);
return true;
}
}

Expand Down
Loading