diff --git a/.github/workflows/publish_to_pypi.yml b/.github/workflows/publish_to_pypi.yml index 043febb9f..ab7556328 100644 --- a/.github/workflows/publish_to_pypi.yml +++ b/.github/workflows/publish_to_pypi.yml @@ -1,6 +1,8 @@ name: Publish Python 🐍 distribution 📦 to PyPI -on: push +on: + release: + types: [published] jobs: build: @@ -31,7 +33,7 @@ jobs: publish-to-pypi: name: >- Publish Python 🐍 distribution 📦 to PyPI - if: startsWith(github.ref, 'refs/tags/') # only publish to PyPI on tag pushes + # if: startsWith(github.ref, 'refs/tags/') # only publish to PyPI on tag pushes needs: - build runs-on: ubuntu-latest @@ -71,14 +73,14 @@ jobs: inputs: >- ./dist/*.tar.gz ./dist/*.whl - - name: Create GitHub Release - env: - GITHUB_TOKEN: ${{ github.token }} - run: >- - gh release create - "$GITHUB_REF_NAME" - --repo "$GITHUB_REPOSITORY" - --notes "" + # - name: Create GitHub Release + # env: + # GITHUB_TOKEN: ${{ github.token }} + # run: >- + # gh release create + # "$GITHUB_REF_NAME" + # --repo "$GITHUB_REPOSITORY" + # --notes "" - name: Upload artifact signatures to GitHub Release env: GITHUB_TOKEN: ${{ github.token }} diff --git a/bigfun/bigfunctions.py b/bigfun/bigfunctions.py index 81ba89e0a..4338ebf8c 100644 --- a/bigfun/bigfunctions.py +++ b/bigfun/bigfunctions.py @@ -224,7 +224,6 @@ def _deploy_npm_packages(self): def _deploy_cloud_run(self): cloud_run_service = 'bf-' + self.name.replace("_", "-") cloud_run_location = {'EU': 'europe-west1', 'US': 'us-west1'}.get(self.dataset.location, self.dataset.location) - self.config['cloud_run_location'] = cloud_run_location with tempfile.TemporaryDirectory() as folder: remote_connection = self.bigquery.get_or_create_remote_connection(self.project, self.dataset.location, REMOTE_CONNECTION_NAME) self.bigquery.set_remote_connection_users(remote_connection.name, self.dataset.users) diff --git a/bigfun/templates/function_py.py b/bigfun/templates/function_py.py index 55c6b261a..0231104f9 100644 --- 
a/bigfun/templates/function_py.py +++ b/bigfun/templates/function_py.py @@ -20,7 +20,6 @@ CACHE = {} -CURRENT_LOCATION = '{{ cloud_run_location }}' QUOTAS = {{ quotas if quotas is defined else {} }} @@ -43,7 +42,7 @@ def get_current_service_account(): def create_temp_dataset(default_table_expiration_days=0.042): import google.cloud.bigquery - bigquery = google.cloud.bigquery.Client(location=g.bigfunction_dataset_location) + bigquery = google.cloud.bigquery.Client(location=g.dataset_location) random_id = str(uuid.uuid4()).replace('-', '_') dataset_id = f'{PROJECT}.temp_{random_id}' is_user_service_account = 'iam.gserviceaccount.com' in g.user @@ -78,7 +77,8 @@ def init_global_context(data): g.user = data['sessionUser'] user_project_matches = re.findall(r'bigquery.googleapis.com/projects/([^/]*)/', data['caller']) g.user_project = user_project_matches[0] if user_project_matches else None - g.bigfunction_dataset_location = data.get('userDefinedContext', {}).get('dataset_location') + g.dataset_location = data.get('userDefinedContext', {}).get('dataset_location') + g.region = {'EU': 'europe-west1', 'US': 'us-west1'}.get(g.dataset_location, g.dataset_location) diff --git a/bigfun/templates/function_py_test.py b/bigfun/templates/function_py_test.py index eaaf09c05..2f1e635c3 100644 --- a/bigfun/templates/function_py_test.py +++ b/bigfun/templates/function_py_test.py @@ -1,5 +1,4 @@ CACHE = {} -CURRENT_LOCATION = 'europe-west1' {{ init_code }} diff --git a/bigfun/templates/function_sql.sql b/bigfun/templates/function_sql.sql index eb5c84a95..1169ff522 100644 --- a/bigfun/templates/function_sql.sql +++ b/bigfun/templates/function_sql.sql @@ -1,7 +1,7 @@ create or replace function `{{ project }}`.`{{ dataset }}`.{{ name }}({% for argument in arguments %}`{{ argument.name }}` {{ argument.type}}{% if not loop.last %}, {% endif %}{% endfor %}) {% if output.type != 'any type' %}returns {{ output.type }}{% endif %} as ( -{{ code | replace('{BIGFUNCTIONS_DATASET}', '`' + project + 
'`.`' + dataset + '`' ) }} + ({{ code | replace('{BIGFUNCTIONS_DATASET}', '`' + project + '`.`' + dataset + '`' ) }}) ) options( description = '''{{ description }}''' diff --git a/bigfun/templates/mkdocs.yml b/bigfun/templates/mkdocs.yml index 448717643..585873f4f 100644 --- a/bigfun/templates/mkdocs.yml +++ b/bigfun/templates/mkdocs.yml @@ -34,9 +34,10 @@ markdown_extensions: alternate_style: true plugins: - search + - awesome-nav nav: - Home: index.md - - BigFunctions!: bigfunctions/README.md + - "{BIGFUNCTIONS_DOC}" validation: nav: omitted_files: ignore diff --git a/bigfunctions/take_actions/export/export_to_storage.yaml b/bigfunctions/take_actions/export/export_to_storage.yaml index ca561f027..b55073875 100644 --- a/bigfunctions/take_actions/export/export_to_storage.yaml +++ b/bigfunctions/take_actions/export/export_to_storage.yaml @@ -10,6 +10,8 @@ arguments: type: string - name: content type: string + - name: content_type + type: string output: name: status type: string @@ -29,6 +31,7 @@ code: | #python return 'NO_PATH_PROVIDED' if not content: return 'NO_CONTENT_PROVIDED' + content_type = content_type or 'text/plain' path = path.replace('gs://', '') if '/' not in path: return 'INVALID_PATH_PROVIDED' @@ -36,7 +39,7 @@ code: | #python bucket = storage.bucket(bucket_name) blob = bucket.blob(filename) try: - blob.upload_from_string(content) + blob.upload_from_string(content, content_type=content_type) except google.api_core.exceptions.NotFound: assert False, f'Given bucket does not exist OR service Account `{get_current_service_account()}` does not have objectCreator role' except (google.api_core.exceptions.PermissionDenied, google.api_core.exceptions.Forbidden): diff --git a/bigfunctions/take_actions/trigger_event/trigger_dataform.yaml b/bigfunctions/take_actions/trigger_event/trigger_dataform.yaml new file mode 100644 index 000000000..d6891bd0b --- /dev/null +++ b/bigfunctions/take_actions/trigger_event/trigger_dataform.yaml @@ -0,0 +1,113 @@ +type: 
function_py +author: Moldavanka +description: | + Triggers a Dataform workflow in the specified project, location, repository and workspace. + Returns the workflow invocation ID if successful. + +arguments: + - name: repo_location + type: string + description: The GCP location of the Dataform repository (e.g., 'us-central1') + - name: repo_name + type: string + description: The name of the Dataform repository + - name: repo_workspace + type: string + description: The name of the Dataform workspace + - name: project_id + type: string + description: The GCP project ID where the Dataform repository is located + - name: compilation_result_id + type: string + description: Optional compilation result ID. If provided, uses this instead of creating a new compilation + default: "" + +output: + name: invocation_result + type: string + description: JSON string with workflow invocation ID or error message + +examples: + - description: "Trigger a Dataform workflow with a new compilation" + arguments: + - "'us-central1'" + - "'my-repo'" + - "'my-workspace'" + - "'my-project'" + output: '{"workflow_invocation_id": "workflow_invocation_123456"}' + +code: | #python + import os + import json + from google.cloud import dataform + + # Call the Dataform API; any failure is returned as a JSON error + try: + # Initialize the Dataform client + client = dataform.DataformClient() + + # Set up repository path + repo_path = client.repository_path( + project=project_id, location=repo_location, repository=repo_name + ) + + # If compilation_result_id is provided, use it + if compilation_result_id: + compilation_result_path = client.compilation_result_path( + project=project_id, + location=repo_location, + repository=repo_name, + compilation_result=compilation_result_id + ) + + # Create a compilation result object + compilation_result = dataform.CompilationResult() + compilation_result.name = compilation_result_path + else: + # Need to compile from workspace + workspace_path = client.workspace_path( + project=project_id, +
location=repo_location, + repository=repo_name, + workspace=repo_workspace, + ) + + # Create a new compilation + compilation_result = dataform.CompilationResult() + compilation_result.workspace = workspace_path + + # Request to create the compilation + create_compilation_request = dataform.CreateCompilationResultRequest() + create_compilation_request.parent = repo_path + create_compilation_request.compilation_result = compilation_result + + # Execute the compilation + compilation_result = client.create_compilation_result(request=create_compilation_request) + + # Create workflow invocation + wf_invocation = dataform.WorkflowInvocation() + wf_invocation.compilation_result = compilation_result.name + + # Set up the request + request = dataform.CreateWorkflowInvocationRequest() + request.parent = repo_path + request.workflow_invocation = wf_invocation + + # Execute the workflow invocation + result = client.create_workflow_invocation(request=request) + + # Return the workflow invocation ID + invocation_id = result.name.split('/')[-1] + return json.dumps({"workflow_invocation_id": invocation_id}) + + except Exception as e: + return json.dumps({"error": str(e)}) + +requirements: | + google-cloud-dataform==0.5.2 + +cloud_run: + memory: 512Mi + cpu: 1 + max_instances: 5 + timeout: 300 diff --git a/bigfunctions/transform/ai/ask_ai.yaml b/bigfunctions/transform/ai/ask_ai.yaml index 39b3e40ea..773c4d7ed 100644 --- a/bigfunctions/transform/ai/ask_ai.yaml +++ b/bigfunctions/transform/ai/ask_ai.yaml @@ -58,7 +58,7 @@ init_code: | import vertexai from vertexai.preview.language_models import TextGenerationModel, CodeGenerationModel from vertexai.preview.generative_models import GenerativeModel - vertexai.init(location=CURRENT_LOCATION) + vertexai.init(location=g.region) code: | model = model or 'gemini-pro' prompt = (prompt or '').strip() diff --git a/bigfunctions/transform/ai/ask_appstore_reviews.yaml b/bigfunctions/transform/ai/ask_appstore_reviews.yaml index fe455f4dd..41b571855 
100644 --- a/bigfunctions/transform/ai/ask_appstore_reviews.yaml +++ b/bigfunctions/transform/ai/ask_appstore_reviews.yaml @@ -31,7 +31,6 @@ examples: ... code: | - ( with reviews as ( @@ -55,4 +54,3 @@ code: | select {BIGFUNCTIONS_DATASET}.ask_ai(query, 'gemini-1.5-flash-preview-0514') from generated_prompt - ) diff --git a/bigfunctions/transform/ai/categorize.yaml b/bigfunctions/transform/ai/categorize.yaml index 6e4136905..780502965 100644 --- a/bigfunctions/transform/ai/categorize.yaml +++ b/bigfunctions/transform/ai/categorize.yaml @@ -132,7 +132,7 @@ init_code: | item['id'] = k return dicttoxml.dicttoxml(item, custom_root='item', attr_type=False, xml_declaration=False).decode('utf-8') - vertexai.init(location=CURRENT_LOCATION) + vertexai.init(location=g.region) model = GenerativeModel(MODEL) code: | if not items: diff --git a/bigfunctions/transform/ai/generate_categories.yaml b/bigfunctions/transform/ai/generate_categories.yaml index 6a5a3db7d..7da6263d3 100644 --- a/bigfunctions/transform/ai/generate_categories.yaml +++ b/bigfunctions/transform/ai/generate_categories.yaml @@ -123,7 +123,7 @@ init_code: | return dicttoxml.dicttoxml(item, custom_root='item', attr_type=False, xml_declaration=False).decode('utf-8') assert False, 'each item of `items` must be a string or an object' - vertexai.init(location=CURRENT_LOCATION) + vertexai.init(location=g.region) model = GenerativeModel(MODEL) code: | if not items: diff --git a/bigfunctions/transform/machine_learning/precision_recall_auc.yaml b/bigfunctions/transform/machine_learning/precision_recall_auc.yaml index 71c3cf4c2..9fa8172e2 100644 --- a/bigfunctions/transform/machine_learning/precision_recall_auc.yaml +++ b/bigfunctions/transform/machine_learning/precision_recall_auc.yaml @@ -19,10 +19,8 @@ examples: - "(select array_agg(struct(cast(predicted_score as float64), predicted_score > 500)) from unnest(generate_array(1, 1000)) as predicted_score)" output: "1.0" code: | - ( select sum(auc_contrib) from ( select 
(recall - lag(recall) over (order by recall)) * (precision + lag(precision) over (order by recall)) / 2 as auc_contrib from {BIGFUNCTIONS_DATASET}.precision_recall_curve(predictions) ) - ) diff --git a/bigfunctions/transform/machine_learning/roc_auc.yaml b/bigfunctions/transform/machine_learning/roc_auc.yaml index 722e6c98b..cefb0f122 100644 --- a/bigfunctions/transform/machine_learning/roc_auc.yaml +++ b/bigfunctions/transform/machine_learning/roc_auc.yaml @@ -23,10 +23,8 @@ examples: - "(select array_agg(struct(cast(predicted_score as float64), predicted_score < 500)) from unnest(generate_array(1, 1000)) as predicted_score)" output: "0.0" code: | - ( select sum(auc_contrib) from ( select (false_positive_rate - lag(false_positive_rate) over (order by false_positive_rate)) * (true_positive_rate + lag(true_positive_rate) over (order by false_positive_rate)) / 2 as auc_contrib from {BIGFUNCTIONS_DATASET}.roc_curve(predictions) ) - ) diff --git a/bigfunctions/transform/transform_array/are_arrays_equal.yaml b/bigfunctions/transform/transform_array/are_arrays_equal.yaml index f44467191..bc7ddc899 100644 --- a/bigfunctions/transform/transform_array/are_arrays_equal.yaml +++ b/bigfunctions/transform/transform_array/are_arrays_equal.yaml @@ -22,4 +22,4 @@ examples: - "[1, 4, 3]" - "[1, 4]" output: "false" -code: (to_json_string(array1) = to_json_string(array2)) +code: to_json_string(array1) = to_json_string(array2) diff --git a/bigfunctions/transform/transform_array/array_intersect.yaml b/bigfunctions/transform/transform_array/array_intersect.yaml index 316417bbd..e40b5c29c 100644 --- a/bigfunctions/transform/transform_array/array_intersect.yaml +++ b/bigfunctions/transform/transform_array/array_intersect.yaml @@ -18,10 +18,8 @@ examples: output: "[2]" region: ALL code: | - ( array( select * from unnest(array1) intersect distinct select * from unnest(array2) ) - ) diff --git a/bigfunctions/transform/transform_array/array_union.yaml 
b/bigfunctions/transform/transform_array/array_union.yaml index 955d6a8d9..8ee1d6756 100644 --- a/bigfunctions/transform/transform_array/array_union.yaml +++ b/bigfunctions/transform/transform_array/array_union.yaml @@ -3,7 +3,7 @@ author: Axel Thevenot description: | Returns the union of two arrays. arguments: - - name: array1 + - name: array1 type: any type - name: array2 type: any type @@ -18,10 +18,8 @@ examples: output: "[1, 2, 3, 6, 7]" region: ALL code: | - ( array( select * from unnest(array1) union distinct select * from unnest(array2) ) - ) diff --git a/bigfunctions/transform/transform_array/benford_distance.yaml b/bigfunctions/transform/transform_array/benford_distance.yaml index b00d886c3..ef161e127 100644 --- a/bigfunctions/transform/transform_array/benford_distance.yaml +++ b/bigfunctions/transform/transform_array/benford_distance.yaml @@ -34,7 +34,6 @@ examples: - "[9, 9, 9, 9, 9, 9, 9, 9, 9]" output: "20.7" code: | - ( with leading_digits as ( @@ -75,4 +74,3 @@ code: | from benfords_distribution left join observed_distribution using(leading_digit) - ) diff --git a/bigfunctions/transform/transform_array/find_greater_value.yaml b/bigfunctions/transform/transform_array/find_greater_value.yaml index f98ea33ef..e79be12b6 100644 --- a/bigfunctions/transform/transform_array/find_greater_value.yaml +++ b/bigfunctions/transform/transform_array/find_greater_value.yaml @@ -28,11 +28,9 @@ examples: - "110" output: "null" code: | - ( select offset from unnest(arr) as value with offset where value >= x order by offset limit 1 - ) diff --git a/bigfunctions/transform/transform_array/find_lower_value.yaml b/bigfunctions/transform/transform_array/find_lower_value.yaml index b3a336cb1..e638b4452 100644 --- a/bigfunctions/transform/transform_array/find_lower_value.yaml +++ b/bigfunctions/transform/transform_array/find_lower_value.yaml @@ -28,11 +28,9 @@ examples: - "2" output: "null" code: | - ( select offset from unnest(arr) as value with offset where value <= x
order by offset limit 1 - ) diff --git a/bigfunctions/transform/transform_array/find_value.yaml b/bigfunctions/transform/transform_array/find_value.yaml index e9a0e1253..481b104c1 100644 --- a/bigfunctions/transform/transform_array/find_value.yaml +++ b/bigfunctions/transform/transform_array/find_value.yaml @@ -22,4 +22,4 @@ examples: - "[3, 4]" - "7" output: "null" -code: (select offset from unnest(arr) as x with offset where x = value order by offset limit 1) +code: select offset from unnest(arr) as x with offset where x = value order by offset limit 1 diff --git a/bigfunctions/transform/transform_array/frequent_values.yaml b/bigfunctions/transform/transform_array/frequent_values.yaml index 5b82343b2..e015364e5 100644 --- a/bigfunctions/transform/transform_array/frequent_values.yaml +++ b/bigfunctions/transform/transform_array/frequent_values.yaml @@ -23,7 +23,6 @@ examples: - "0.4" output: "[\"banana\"]" code: | - ( with value_counts as ( @@ -41,4 +40,3 @@ code: | ) select array_agg(value) from frequent_values - ) diff --git a/bigfunctions/transform/transform_array/get_value.yaml b/bigfunctions/transform/transform_array/get_value.yaml index 0ab97ecd5..4f9094246 100644 --- a/bigfunctions/transform/transform_array/get_value.yaml +++ b/bigfunctions/transform/transform_array/get_value.yaml @@ -27,4 +27,4 @@ examples: - "[struct('a' as key, 8 as value), struct('a' as key, 9 as value)]" - "'a'" output: "8" -code: (select value from unnest(key_value_items) item where item.key = search_key limit 1) +code: select value from unnest(key_value_items) item where item.key = search_key limit 1 diff --git a/bigfunctions/transform/transform_array/max_value.yaml b/bigfunctions/transform/transform_array/max_value.yaml index 1f080c11e..07d148ab0 100644 --- a/bigfunctions/transform/transform_array/max_value.yaml +++ b/bigfunctions/transform/transform_array/max_value.yaml @@ -14,4 +14,4 @@ examples: arguments: - "[1, 4, 3]" output: "4" -code: (select max(value) from unnest(arr) 
value) +code: select max(value) from unnest(arr) value diff --git a/bigfunctions/transform/transform_array/median_value.yaml b/bigfunctions/transform/transform_array/median_value.yaml index 04af76908..b16b354cb 100644 --- a/bigfunctions/transform/transform_array/median_value.yaml +++ b/bigfunctions/transform/transform_array/median_value.yaml @@ -18,4 +18,4 @@ examples: arguments: - "[1, 4, 3, 2]" output: "2.5" -code: (select percentile_cont(value, 0.5) over() from unnest(arr) value limit 1) +code: select percentile_cont(value, 0.5) over() from unnest(arr) value limit 1 diff --git a/bigfunctions/transform/transform_array/min_max_scaler.yaml b/bigfunctions/transform/transform_array/min_max_scaler.yaml index 01df130cd..7bfcc2206 100644 --- a/bigfunctions/transform/transform_array/min_max_scaler.yaml +++ b/bigfunctions/transform/transform_array/min_max_scaler.yaml @@ -15,7 +15,6 @@ examples: - "[1, 2, 3, 4, 5]" output: "[0, 0.25, 0.5, 0.75, 1]" code: | - ( with min_max as ( select min(x) as mn, @@ -28,4 +27,3 @@ code: | from unnest(arr) x, min_max ) - ) diff --git a/bigfunctions/transform/transform_array/min_value.yaml b/bigfunctions/transform/transform_array/min_value.yaml index 01d6ec538..a962e2324 100644 --- a/bigfunctions/transform/transform_array/min_value.yaml +++ b/bigfunctions/transform/transform_array/min_value.yaml @@ -14,4 +14,4 @@ examples: arguments: - "[1, 4, 3]" output: "1" -code: (select min(value) from unnest(arr) value) +code: select min(value) from unnest(arr) value diff --git a/bigfunctions/transform/transform_array/percentile_value.yaml b/bigfunctions/transform/transform_array/percentile_value.yaml index 88f9d0121..5c152d1f3 100644 --- a/bigfunctions/transform/transform_array/percentile_value.yaml +++ b/bigfunctions/transform/transform_array/percentile_value.yaml @@ -29,7 +29,6 @@ examples: - "2" output: 'null' code: | - ( select case when (percentile > 1) @@ -77,4 +76,3 @@ code: | ) end end - ) diff --git 
a/bigfunctions/transform/transform_array/rare_values.yaml b/bigfunctions/transform/transform_array/rare_values.yaml index 419f0aa00..8e63ed7ca 100644 --- a/bigfunctions/transform/transform_array/rare_values.yaml +++ b/bigfunctions/transform/transform_array/rare_values.yaml @@ -25,7 +25,6 @@ examples: - "0.2" output: "[\"cherry\"]" code: | - ( with value_counts as ( @@ -43,4 +42,3 @@ code: | ) select array_agg(value) from rare_values - ) diff --git a/bigfunctions/transform/transform_array/remove_value.yaml b/bigfunctions/transform/transform_array/remove_value.yaml index 01a310f86..095362e16 100644 --- a/bigfunctions/transform/transform_array/remove_value.yaml +++ b/bigfunctions/transform/transform_array/remove_value.yaml @@ -16,4 +16,4 @@ examples: - "[1, 4, 3, 8]" - "4" output: "[1, 3, 8]" -code: (select array_agg(v) from unnest(arr) v where v <> value) +code: select array_agg(v) from unnest(arr) v where v <> value diff --git a/bigfunctions/transform/transform_array/sum_values.yaml b/bigfunctions/transform/transform_array/sum_values.yaml index 542276653..edb6d7240 100644 --- a/bigfunctions/transform/transform_array/sum_values.yaml +++ b/bigfunctions/transform/transform_array/sum_values.yaml @@ -13,4 +13,4 @@ examples: arguments: - "[1, 4, 3]" output: "8" -code: (select sum(value) from unnest(arr) value) +code: select sum(value) from unnest(arr) value diff --git a/bigfunctions/transform/transform_array/z_scores.yaml b/bigfunctions/transform/transform_array/z_scores.yaml index f4df63b09..d8f213b55 100644 --- a/bigfunctions/transform/transform_array/z_scores.yaml +++ b/bigfunctions/transform/transform_array/z_scores.yaml @@ -17,7 +17,6 @@ examples: - "[1, 2, 3, 4, 5]" output: "[-1.414, -0.707, 0, 0.707, 1.414]" code: | - ( with stats as ( select avg(value) as mean, @@ -34,4 +33,3 @@ code: | ) from unnest(arr) as value, stats - ) diff --git a/bigfunctions/transform/transform_date/date_sub_isoyear.yaml b/bigfunctions/transform/transform_date/date_sub_isoyear.yaml index 
035b78a9a..3c640d306 100644 --- a/bigfunctions/transform/transform_date/date_sub_isoyear.yaml +++ b/bigfunctions/transform/transform_date/date_sub_isoyear.yaml @@ -18,7 +18,6 @@ examples: - "3" output: "2020-05-29" code: | - ( with dates as ( select date_sub(date, interval x year) dt, @@ -43,4 +42,3 @@ code: | select date_sub(date, interval weeks_cnt week) from weeks_to_remove - ) diff --git a/bigfunctions/transform/transform_date/translated_month_name.yaml b/bigfunctions/transform/transform_date/translated_month_name.yaml index 969981c91..27de5d9c7 100644 --- a/bigfunctions/transform/transform_date/translated_month_name.yaml +++ b/bigfunctions/transform/transform_date/translated_month_name.yaml @@ -20,10 +20,8 @@ examples: - "'es'" output: "mayo" code: | - ( select translated_month from {BIGFUNCTIONS_DATASET}.translated_months where lower(locale) = lower(language) and month_nb = extract(month from date) - ) diff --git a/bigfunctions/transform/transform_date/translated_weekday_name.yaml b/bigfunctions/transform/transform_date/translated_weekday_name.yaml index 312a18b97..a86056267 100644 --- a/bigfunctions/transform/transform_date/translated_weekday_name.yaml +++ b/bigfunctions/transform/transform_date/translated_weekday_name.yaml @@ -20,10 +20,8 @@ examples: - "'fr'" output: "vendredi" code: | - ( select translated_day from {BIGFUNCTIONS_DATASET}.translated_days where lower(locale) = lower(language) and day_nb = extract(dayofweek from date) - ) diff --git a/bigfunctions/transform/transform_numeric/quantize_into_bins.yaml b/bigfunctions/transform/transform_numeric/quantize_into_bins.yaml index 46f8fe8b2..c37e57056 100644 --- a/bigfunctions/transform/transform_numeric/quantize_into_bins.yaml +++ b/bigfunctions/transform/transform_numeric/quantize_into_bins.yaml @@ -33,7 +33,6 @@ examples: - "[0, 1, 5, 10]" output: "]10, +∞[" code: | - ( select case when value < bin_bounds[offset(0)] then "]-∞, " || cast(bin_bounds[offset(0)] as string) || "[" @@ -59,4 +58,3 @@ code: 
| from offsets ) end - ) diff --git a/bigfunctions/transform/transform_numeric/quantize_into_bins_with_labels.yaml b/bigfunctions/transform/transform_numeric/quantize_into_bins_with_labels.yaml index 4cdb918fa..cc7d2506a 100644 --- a/bigfunctions/transform/transform_numeric/quantize_into_bins_with_labels.yaml +++ b/bigfunctions/transform/transform_numeric/quantize_into_bins_with_labels.yaml @@ -49,7 +49,6 @@ examples: - "['Lower than very bad!', 'Fail', 'Wait for result exam', 'Pass', 'Pass with mention', 'Genius!']" output: "Lower than very bad" code: | - ( with index as ( @@ -75,4 +74,3 @@ code: | select labls[ordinal(ix)] from padded_labels, index - ) diff --git a/bigfunctions/transform/transform_numeric/quantize_into_fixed_width_bins.yaml b/bigfunctions/transform/transform_numeric/quantize_into_fixed_width_bins.yaml index 29fe7ef31..103d3a37f 100644 --- a/bigfunctions/transform/transform_numeric/quantize_into_fixed_width_bins.yaml +++ b/bigfunctions/transform/transform_numeric/quantize_into_fixed_width_bins.yaml @@ -45,7 +45,6 @@ examples: - "10" output: "]100, +∞[" code: | - ( select case when value < min_bound then "]-∞, " || cast(min_bound as string) || "[" @@ -73,4 +72,3 @@ code: | from offsets, bin_width ) end - ) diff --git a/bigfunctions/transform/transform_string/convert_non_ascii_characters_to_unicode_escape_sequences.yaml b/bigfunctions/transform/transform_string/convert_non_ascii_characters_to_unicode_escape_sequences.yaml index cb1031653..54b7c68ed 100644 --- a/bigfunctions/transform/transform_string/convert_non_ascii_characters_to_unicode_escape_sequences.yaml +++ b/bigfunctions/transform/transform_string/convert_non_ascii_characters_to_unicode_escape_sequences.yaml @@ -14,7 +14,6 @@ examples: output: "SCH\\u00dcMANN" region: ALL code: | - ( select array_to_string( array( @@ -23,4 +22,3 @@ code: | ), '' ) - ) diff --git a/bigfunctions/transform/transform_string/ngram_frequency_similarity.yaml 
b/bigfunctions/transform/transform_string/ngram_frequency_similarity.yaml index 3bf57a67a..649f575f9 100644 --- a/bigfunctions/transform/transform_string/ngram_frequency_similarity.yaml +++ b/bigfunctions/transform/transform_string/ngram_frequency_similarity.yaml @@ -47,7 +47,6 @@ examples: - "4" output: "0.93" code: | - ( with ngrams1 as ( @@ -90,4 +89,3 @@ code: | dot_product / (magnitude1 * magnitude2) ) as similarity from coefficients - ) diff --git a/bigfunctions/transform/transform_string/remove_strings.yaml b/bigfunctions/transform/transform_string/remove_strings.yaml index 448600898..e2d18a95c 100644 --- a/bigfunctions/transform/transform_string/remove_strings.yaml +++ b/bigfunctions/transform/transform_string/remove_strings.yaml @@ -16,7 +16,6 @@ examples: - "['can', 'eat']" output: "I dies" code: | - ( with escaped_strings_to_remove as ( select regexp_replace(str, r'(\.|\+|\*|\?|\^|\$|\(|\)|\[|\]|\{|\}|\||\\)', r'\\\1') as str from unnest(strings_to_remove) as str @@ -26,4 +25,3 @@ code: | '(' || array_to_string((select array_agg(str) from escaped_strings_to_remove), '|') || ')', '' ) - ) diff --git a/bigfunctions/transform/transform_string/remove_words.yaml b/bigfunctions/transform/transform_string/remove_words.yaml index 93ca9628b..e2be32bfa 100644 --- a/bigfunctions/transform/transform_string/remove_words.yaml +++ b/bigfunctions/transform/transform_string/remove_words.yaml @@ -16,4 +16,4 @@ examples: - "['can', 'eat']" output: "I candies" code: | - (select regexp_replace(string, r'\b(' || array_to_string(words_to_remove, '|') || r')\b', '')) + select regexp_replace(string, r'\b(' || array_to_string(words_to_remove, '|') || r')\b', '') diff --git a/bigfunctions/transform/transform_string/url_decode.yaml b/bigfunctions/transform/transform_string/url_decode.yaml index 67297baad..cbb0bac09 100644 --- a/bigfunctions/transform/transform_string/url_decode.yaml +++ b/bigfunctions/transform/transform_string/url_decode.yaml @@ -15,7 +15,6 @@ examples: - 
"'http%3A%2F%2Fwww.example.com%2Fhello%3Fv%3D12345'" output: "http://www.example.com/hello?v=12345" code: | - ( select string_agg( if(regexp_contains(y, r'^%[0-9a-fA-F]{2}'), safe_convert_bytes_to_string(from_hex(replace(y, '%', ''))), y), '' @@ -23,4 +22,3 @@ code: | ) from unnest(regexp_extract_all(url_encoded_string, r"%[0-9a-fA-F]{2}(?:%[0-9a-fA-F]{2})*|[^%]+")) y with offset as i - ) diff --git a/bigfunctions/utils/timestamp_from_unix_date_time.yaml b/bigfunctions/utils/timestamp_from_unix_date_time.yaml index ea2373483..630a0e2cf 100644 --- a/bigfunctions/utils/timestamp_from_unix_date_time.yaml +++ b/bigfunctions/utils/timestamp_from_unix_date_time.yaml @@ -22,7 +22,6 @@ examples: - '"YEAR"' output: "2001-01-01 00:00:00 UTC" code: | - ( case upper(date_time_part) when 'MICROSECOND' then timestamp_add(timestamp('1970-01-01'), INTERVAL unix_date_time MICROSECOND) when 'MILLISECOND' then timestamp_add(timestamp('1970-01-01'), INTERVAL unix_date_time MILLISECOND) @@ -43,4 +42,3 @@ code: | when 'YEAR' then timestamp(date_add(date('1970-01-01'), INTERVAL unix_date_time YEAR)) else error(date_time_part || ' is not a valid `date_time_part`.') end - ) diff --git a/bigfunctions/utils/timestamp_to_unix_date_time.yaml b/bigfunctions/utils/timestamp_to_unix_date_time.yaml index 623c8cb67..2235a5ab0 100644 --- a/bigfunctions/utils/timestamp_to_unix_date_time.yaml +++ b/bigfunctions/utils/timestamp_to_unix_date_time.yaml @@ -22,7 +22,6 @@ examples: - '"YEAR"' output: "31" code: | - ( case upper(date_time_part) when 'MICROSECOND' then unix_micros(timestamp_expression) when 'MILLISECOND' then unix_millis(timestamp_expression) @@ -43,4 +42,3 @@ code: | when 'YEAR' then date_diff(date(timestamp_expression), '1970-01-01', YEAR) else error(date_time_part || ' is not a valid `date_time_part`.') end - ) diff --git a/docs/community.md b/docs/community.md index 4fbbb9e51..20c2f4e03 100644 --- a/docs/community.md +++ b/docs/community.md @@ -125,7 +125,7 @@ To make a change to this 
repository: 2. Clone your fork 3. Create a virtual env then install the packages (including dev packages) with `pip install -e .[dev]` 4. Make your changes. -5. Run pre-commit to follow the repo rules using `precommit run` +5. Run pre-commit to follow the repo rules using `pre-commit run` 6. Commit and Push your changes to your fork. 7. [Create a pull request](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request-from-a-fork). diff --git a/docs/framework.md b/docs/framework.md index 7aa358c2d..2364c040d 100644 --- a/docs/framework.md +++ b/docs/framework.md @@ -242,6 +242,299 @@ select PROJECT.DATASET.faker("name", "it_IT")
+## YAML Syntax + +=== "SQL" + + ```yaml + type: function_sql #(1)! + author: John Doe #(2)! + description: | #(3)! + Multiplies a number by a factor + (example function for documentation purposes) + arguments: #(4)! + - name: num + type: float64 + - name: factor + type: float64 + output: #(5)! + name: product + type: float64 + examples: #(6)! + - description: Basic multiplication + arguments: + - 5 + - 3 + output: 15 + - description: Decimal multiplication + arguments: + - 2.5 + - 4 + output: 10.0 + code: | #(7)! + ( + SELECT num * factor + ) + ``` + + 1. `type` + *(Required)* + Function category declaration. + 2. `author` + *(Optional)* + Function creator/maintainer identifier. + 3. `description` + *(Required)* + Clear explanation of the function's purpose and behavior. + 4. `arguments` + *(Required)* + List of input arguments with BigQuery-compatible types. + **Argument structure**: + - **`name`** + Valid identifier (snake_case recommended) + Example: `user_id`, `transaction_amount` + - **`type`** + BigQuery-supported data type: + ```python + BOOL | INT64 | FLOAT64 | STRING | JSON | DATE | TIMESTAMP + ``` + [BigQuery Data Types Reference :material-arrow-right:](https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types) + 5. `output` + *(Required)* + Definition of the function's return value structure. + **Output Structure:** + - **`name`** + Identifier for the return value (snake_case recommended) + Example: `result`, `total_amount` + - **`type`** + BigQuery-compatible data type: + ```python + BOOL | INT64 | FLOAT64 | STRING | JSON | DATE | TIMESTAMP + ``` + **Example:** + ```yaml + output: + name: final_price + type: FLOAT64 + description: Total amount after applying discounts and taxes + ``` + [BigQuery Data Types Reference :material-arrow-right:](https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types) + 6. `examples` + *(Required)* + List of practical usage demonstrations for the function. 
+ **Key Elements:** + - **`description`** : Context explanation + - **`arguments`** : Input values + - **`output`** : Expected result + 7. `code` + *(Required)* + SQL query implementation for the function's logic. + + +=== "Python" + + ```yaml + type: function_py #(1)! + author: John Doe #(2)! + description: | #(3)! + Generates a personalized greeting message + Combines first and last name with a welcome phrase + arguments: #(4)! + - name: first_name + type: string + - name: last_name + type: string + output: #(5)! + name: greeting + type: string + examples: #(6)! + - description: Basic usage + arguments: + - "'John'" + - "'Doe'" + output: "Hello John Doe" + - description: Different name + arguments: + - "'Marie'" + - "'Curie'" + output: "Hello Marie Curie" + init_code: | #(7)! + # Pre-imported modules (executed once) + import requests # Example dependency + code: | #(9)! + return f"Hello {first_name} {last_name}" + requirements: | #(10)! + # External libraries needed + numpy==1.24.2 + requests>=2.28.1 + dockerfile: | #(8)! + image: python:3.9-slim # Base image + apt_packages: # System dependencies + - libgomp1 + additional_commands: | + # Additional setup commands + RUN pip install --upgrade pip + secrets: | #(14)! + - name: API_KEY + description: External service authentication + documentation_link: https://example.com/api-docs + max_batching_rows: 1 #(11)! + quotas: | #(12)! + max_rows_per_user_per_day: 10000000 # Daily user quota + max_rows_per_query: 2 # Per-query limit + cloud_run: #(13)! + memory: 2Gi + concurrency: 80 # Max concurrent requests/instance + cpu: 2 # vCPU count + ``` + + 1. `type` + *(Required)* + Function category declaration. + 2. `author` + *(Optional)* + Function creator/maintainer identifier. + 3. `description` + *(Required)* + Clear explanation of the function's purpose and behavior. + 4. `arguments` + *(Required)* + List of input arguments with types.
+ **Argument structure**: + - **`name`** + Valid Python identifier (snake_case recommended) + Example: `user_id`, `transaction_amount` + - **`type`** + Data type from allowed set: + ```python + BOOL | STRING | JSON | INT64 | FLOAT64 + ``` + [Python Type Hints Documentation :material-arrow-right:](https://docs.python.org/3/library/typing.html) + 5. Output structure (name, type). + 6. List of usage examples (description, arguments, output). + 7. `init_code` + *(Optional)* + Initialization code executed once during container startup, before any function invocation. + **Example:** + ```python + # Pre-load expensive dependencies + import requests # HTTP client + import numpy as np # Numerical computations + from google.cloud import bigquery # GCP integration + # Initialize shared resources + client = bigquery.Client() + model = load_ml_model("gs://bucket/model.pkl") # One-time model loading + ``` + **Key Use Cases:** + - Pre-importing expensive modules to reduce per-request latency + - Initializing database connections/pools + - Loading ML models or configuration files + - Setting up shared caches or global variables + 8. `dockerfile` + *(Optional)* + Custom Docker container configuration for function packaging. + By default, the `uv` Python image is used. + **Configurable Elements:** + - **`image`** + Base Docker image (e.g., `python:3.9-slim`). + *Recommendation: Use specific version tags (e.g., `python:3.9.18-slim`)* + - **`apt_packages`** + System packages to install: + ```yaml + apt_packages: + - libgomp1 # OpenMP support + - libpq-dev # PostgreSQL bindings + ``` + - **`additional_commands`** + Custom build commands (executed in order): + ```dockerfile + RUN pip install --upgrade pip + ``` + **⚠️ Important Notes:** + - Prefer official images for security + - Don't modify the default `EXPOSE 8080` + 9. `code` + *(Required)* + Python function implementation containing the core business logic.
+ **Key Considerations:** + - Arguments defined in the `arguments` section of the yaml are available here in the code. + - Dependencies must be declared in `requirements` section + 10. `requirements` + *(Optional)* + Python packages required by the function, following `requirements.txt` syntax. + **Format:** + ```text + package1==1.2.3 + package2>=4.5.6 + package3 # Comment explaining purpose + ``` + **Example:** + ```python + numpy==1.24.2 + requests>=2.28.1 + google-cloud-storage # For cloud integration + ``` + [Python Packaging Documentation :material-arrow-right:](https://packaging.python.org/en/latest/tutorials/managing-dependencies/) + 11. `max_batching_rows` + *(Optional)* + You can specify `max_batching_rows` as the maximum number of rows in each HTTP request, to avoid Cloud Run functions timeout. If you specify `max_batching_rows`, BigQuery determines the number of rows in a batch up to the `max_batching_rows` limit. If not specified, BigQuery determines the number of rows to batch automatically. [Documentation](https://cloud.google.com/bigquery/docs/remote-functions#limiting_number_of_rows_in_a_batch_request). + 12. `quotas` + *(Optional)* + Resource limits to prevent abuse and ensure system stability: + - **`max_rows_per_query`** + Maximum number of rows in a query using the function. + - **`max_rows_per_user_per_day`** + Maximum number of rows per day per user in queries using the function. + 13. *(Optional)* Cloud Run Configuration + Configure scaling, compute resources, and deployment settings for your Cloud Run service. + All arguments from [official Cloud Run documentation :material-arrow-right:](https://cloud.google.com/run/docs/) are supported (by convention, `-` is replaced by `_` in argument names).
+ Examples of configuration: + ```yaml + # Service Account (default to compute engine service account of your project) + service_account: XXXXXXXXX-compute@developer.gserviceaccount.com + + # Allocated memory per instance (valid: 128Mi to 32Gi, in 64Mi increments) + memory: 512Mi + + # Number of allocated CPUs per instance (default: 1) + cpu: 1 + + # Maximum concurrent requests per instance + concurrency: 8 # Set to 1 for strict isolation + + # Maximum request duration (e.g., 300s = 5 minutes) + timeout: 300s + + # Environment variables (format: KEY1=value1,KEY2=value2) + set_env_vars: DEBUG=true,MAX_RETRIES=3 + + # Minimum number of running instances (avoids cold starts) + min_instances: 1 + + # Maximum number of instances allowed + max_instances: 100 + ``` + 14. *(Optional)* + Secrets + + +=== "Javascript" + + To be documented + +=== "SQL Stored Procedure" + + To be documented + +=== "SQL Aggregate Function" + + To be documented + +=== "Javascript Aggregate Function" + + To be documented + + ## ❓ FAQ ??? note "How to correctly highlight `sql`, `python` and `javascript` code in yaml files?" diff --git a/mkdocs.yml b/mkdocs.yml index ba77a4e13..eef103ebe 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -14,6 +14,7 @@ theme: - tabs - content.tabs.link - content.code.copy + - content.code.annotate # - instant # - toc.integrate # - navigation.instant @@ -24,6 +25,8 @@ theme: - navigation.indexes - navigation.hide # - navigation.expand + + markdown_extensions: - attr_list - md_in_html