Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Andre/sync 1 #10534

Draft
wants to merge 7 commits into
base: develop
Choose a base branch
from
Prev Previous commit
Next Next commit
Add annotated target tracing (semgrep/semgrep-proprietary#2237)
This adds a small module that will annotate a list of targets that we're
about to scan with relevant info, and now we will send that off in
traces.

Note that most of this PR is just adding yojson derivers to things. If
we don't like the way I chose to derive some of the edge cases I'm open
to changing them!

## Test plan
```bash
semgrep --pro --trace --config p/default --trace-endpoint semgrep-dev
```
Run the command above in a checkout of https://github.com/SigNoz/signoz. The relevant trace is
[here](https://jaeger-dev2.corp.semgrep.dev/trace/53dd39781d191adb57711cc43f518608?uiFind=b704e985bd5b28c9);
check the `annotated_targets` tag, and hit "copy", not "JSON".

Easiest way to use it is something like pbpaste:
```bash
pbpaste > annotated_targets.json
jq ".[] | select(.stat.textual | not) | .internal_path" annotated_targets.json # get list of non text files
jq "sort_by(.stat.line_count) | .[] | select(.stat.line_count > 4000) | {path: .internal_path, line_count:.stat.line_count}" annotated_targets.json # get list of files w/ line count > 4k
jq ".[] | select(.minified) | .internal_path" annotated_targets.json # get list of minified files
```

Also tested on Elasticsearch (>30k targets); it only adds 5s of overhead
there. Jaeger is still workable too, and doesn't lag unless you are manually
exploring the JSON. Note that we will need to wait until @semgrep/infra bumps
some Jaeger settings before traces with >5k targets will actually
be picked up, since they're relatively large.

synced from Pro b942b9a1266d7975333a74f03ad1ae3c24def812
  • Loading branch information
ajbt200128 authored and GitHub Actions Bot committed Sep 17, 2024
commit 56f5bd1c5a4ef8d8c209cff2a5c538c09f2a2cd8
41 changes: 34 additions & 7 deletions libs/commons/File_type.ml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ type file_type =
| Media of media_type
| Archive of string (* tgz, rpm, etc *)
| Other of string
[@@deriving yojson]

(* programming languages *)
and pl_type =
Expand Down Expand Up @@ -79,20 +80,25 @@ and pl_type =
| IDL of idl_type
| MiscPL of string
| Elixir
[@@deriving yojson]

and config_type =
| Makefile
| Dockerfile
(* note: XML is in webpl_type below *)
| Json
| Jsonnet
| Properties (* Java config *)
| Ignore of string (* any sort of .gitignore *)
| RC of string (* Usually key value, .yarnrc, .npmrc etc. *)
(* kinda pl_type *)
| Yaml
| Terraform
| Sexp (* e.g., dune files *)
| Toml
[@@deriving yojson]

and lisp_type = CommonLisp | Elisp | Scheme | Clojure
and lisp_type = CommonLisp | Elisp | Scheme | Clojure [@@deriving yojson]

and webpl_type =
| Php of string (* php or phpt or script *)
Expand All @@ -102,14 +108,18 @@ and webpl_type =
| Coffee
| Vue
| Css
| Scss
| Html
| Xml
| Opa
| Flash
| Sql
[@@deriving yojson]

and idl_type = Thrift | ATD | Protobuf [@@deriving yojson]

and idl_type = Thrift | ATD | Protobuf
and media_type = Sound of string | Picture of string | Video of string
[@@deriving yojson]

(*****************************************************************************)
(* Main entry point *)
Expand All @@ -120,6 +130,8 @@ and media_type = Sound of string | Picture of string | Video of string
*)
let file_type_of_file file =
let _d, b, e = Filename_.dbe_of_filename_noext_ok !!file in
(* extensions are not case sensitive, at least on windows! *)
let e = String.lowercase_ascii e in
match e with
| "ml"
| "mli"
Expand Down Expand Up @@ -238,6 +250,7 @@ let file_type_of_file file =
| "hack" (* | "hh" *) ->
(* ".hh" is also a popular choice for C++ header files *)
PL (Web Hack)
| "scss" -> PL (Web Scss)
| "css" -> PL (Web Css)
(* "javascript" | "es" | ? *)
| "js" -> PL (Web Js)
Expand All @@ -250,6 +263,7 @@ let file_type_of_file file =
| "htm" ->
PL (Web Html)
| "xml" -> PL (Web Xml)
| "properties" -> Config Properties
| "json" -> Config Json
| "jsonnet"
| "libsonnet" ->
Expand All @@ -271,9 +285,12 @@ let file_type_of_file file =
| "sql3" -> PL (Web Sql)
| "fbobj" -> PL (MiscPL "fbobj")
| "png"
| "psd" (* photoshop *)
| "ai" (* adobe illustrator *)
| "jpg"
| "JPG"
| "gif"
| "svg"
| "tif"
| "tiff" ->
Media (Picture e)
| "xcf"
Expand All @@ -285,11 +302,13 @@ let file_type_of_file file =
Media (Picture e)
| "ppm" -> Media (Picture e)
| "tga" -> Media (Picture e)
| "woff2"
| "ttf"
| "font" ->
Media (Picture e)
| "wav" -> Media (Sound e)
| "swf" -> Media (Picture e)
| "indd" (* indesign document *)
| "ps"
| "pdf" ->
Doc e
Expand Down Expand Up @@ -365,6 +384,7 @@ let file_type_of_file file =
| "clang2" ->
Obj e
(* was Archive *)
| "xlsx" -> Archive e (* excel spreadsheets. They're zip files! *)
| "jar" -> Archive e
| "bz2" -> Archive e
| "gz" -> Archive e
Expand All @@ -374,21 +394,28 @@ let file_type_of_file file =
| "mk" -> Config Makefile
| "rs" -> PL Rust
| "move" -> PL Move_on_aptos
| "go" -> PL Go
| "mod"
| "go" ->
PL Go
| "lua" -> PL Lua
| "r"
| "R" ->
PL R
| "r" -> PL R
| "ex" -> PL Elixir
| _ when UFile.is_executable file -> Binary e
| _ when b = "Makefile" || b = "mkfile" || b = "Imakefile" -> Config Makefile
| _ when b = "Dockerfile" -> Config Dockerfile
| _ when b = "dune" -> Config Sexp
| _ when b = "README" -> Text "txt"
| _ when b = "CODEOWNERS" -> Text "txt"
| _ when b = "LICENSE" -> Text "txt"
| _ when b = "TAGS" -> Binary e
| _ when b = "TARGETS" -> Config Makefile
| _ when b = ".depend" -> Obj "depend"
| _ when b = ".emacs" -> PL (Lisp Elisp)
| _ when b = ".gitattributes" -> Text b
| _ when b = ".gitkeep" -> Text b
| _ when String.starts_with "." b && String.ends_with "ignore" b ->
Config (Ignore b)
| _ when String.starts_with "." b && String.ends_with "rc" b -> Config (RC b)
| _ when UFile.filesize file > 300_000 -> Obj e
| _ -> Other e

Expand Down
14 changes: 12 additions & 2 deletions libs/commons/File_type.mli
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ type file_type =
| Media of media_type
| Archive of string
| Other of string
[@@deriving yojson]

and pl_type =
| OCaml of string
Expand Down Expand Up @@ -45,18 +46,23 @@ and pl_type =
| IDL of idl_type
| MiscPL of string
| Elixir
[@@deriving yojson]

and config_type =
| Makefile
| Dockerfile
| Json
| Jsonnet
| Properties
| Ignore of string
| RC of string
| Yaml
| Terraform
| Sexp
| Toml
[@@deriving yojson]

and lisp_type = CommonLisp | Elisp | Scheme | Clojure
and lisp_type = CommonLisp | Elisp | Scheme | Clojure [@@deriving yojson]

and webpl_type =
| Php of string
Expand All @@ -66,14 +72,18 @@ and webpl_type =
| Coffee
| Vue
| Css
| Scss
| Html
| Xml
| Opa
| Flash
| Sql
[@@deriving yojson]

and idl_type = Thrift | ATD | Protobuf [@@deriving yojson]

and idl_type = Thrift | ATD | Protobuf
and media_type = Sound of string | Picture of string | Video of string
[@@deriving yojson]

(* main entry point *)
val file_type_of_file : Fpath.t -> file_type
Expand Down
10 changes: 10 additions & 0 deletions libs/commons/Fpath_.ml
Original file line number Diff line number Diff line change
Expand Up @@ -51,3 +51,13 @@ let current_dir = Fpath.v "."
(* TODO: get rid of! *)
(* Placeholder path built from the literal "_NOT_A_FILE_". *)
let fake_file : Fpath.t = Fpath.v "_NOT_A_FILE_"

(* Tell whether the given path is equal (per Fpath.equal) to fake_file. *)
let is_fake_file (path : Fpath.t) : bool = Fpath.equal path fake_file
(* Encode a path as a JSON string holding its Fpath.to_string rendering. *)
let to_yojson path =
  let raw = Fpath.to_string path in
  `String raw

(* Decode a path from JSON.
 *
 * Accepts only a `String node, whose content is parsed with
 * Fpath.of_string. Returns Error with a plain string message when the
 * node is of any other kind or when Fpath.of_string rejects the content.
 *)
let of_yojson json =
  match json with
  | `String raw -> (
      match Fpath.of_string raw with
      | Ok path -> Ok path
      (* unwrap Fpath's `Msg so the error is a bare string *)
      | Error (`Msg reason) -> Error reason)
  | unexpected ->
      let rendered = Yojson.Safe.pretty_to_string unexpected in
      Error (Printf.sprintf "Expected `String, received %s" rendered)
2 changes: 2 additions & 0 deletions libs/commons/Fpath_.mli
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@
*)
val of_strings : string list -> Fpath.t list
val to_strings : Fpath.t list -> string list
(*
   JSON conversions for paths.

   to_yojson encodes a path as a `String holding Fpath.to_string of the path.
   of_yojson accepts only a `String and parses it with Fpath.of_string; it
   returns Error with a message for any other JSON value or for a string
   that Fpath.of_string rejects.
*)
val to_yojson : Fpath.t -> Yojson.Safe.t
val of_yojson : Yojson.Safe.t -> (Fpath.t, string) result

(*
Take a nonempty list of path segments and turn them in to relative path.
Expand Down
28 changes: 28 additions & 0 deletions libs/commons/UFile.ml
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,34 @@ end
(* Using Fpath.t *)
(*****************************************************************************)

(* Encode a Unix.file_kind as a JSON string holding the constructor's
 * name (e.g. S_REG becomes `String "S_REG").
 *)
let file_kind_to_yojson (kind : Unix.file_kind) =
  match kind with
  | S_REG -> `String "S_REG"
  | S_DIR -> `String "S_DIR"
  | S_CHR -> `String "S_CHR"
  | S_BLK -> `String "S_BLK"
  | S_LNK -> `String "S_LNK"
  | S_FIFO -> `String "S_FIFO"
  | S_SOCK -> `String "S_SOCK"

(* Decode a Unix.file_kind from JSON.
 *
 * The only accepted inputs are `String nodes holding one of the seven
 * constructor names ("S_REG", "S_DIR", ...). Any other JSON value,
 * including a `String with an unknown name, yields Error with a message
 * that embeds the offending JSON.
 *)
let file_kind_of_yojson (yojson : Yojson.Safe.t) =
  let kind_of_name = function
    | "S_REG" -> Some Unix.S_REG
    | "S_DIR" -> Some Unix.S_DIR
    | "S_CHR" -> Some Unix.S_CHR
    | "S_BLK" -> Some Unix.S_BLK
    | "S_LNK" -> Some Unix.S_LNK
    | "S_FIFO" -> Some Unix.S_FIFO
    | "S_SOCK" -> Some Unix.S_SOCK
    | _ -> None
  in
  let decoded =
    match yojson with
    | `String name -> kind_of_name name
    | _ -> None
  in
  match decoded with
  | Some kind -> Ok kind
  | None ->
      Error
        (Printf.sprintf
           "Could not convert to Unix.file_kind expected `String, received %s"
           (Yojson.Safe.to_string yojson))

let files_of_dirs_or_files_no_vcs_nofilter xs =
xs |> Fpath_.to_strings |> Legacy.files_of_dirs_or_files_no_vcs_nofilter
|> Fpath_.of_strings
Expand Down
2 changes: 2 additions & 0 deletions libs/commons/UFile.mli
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,8 @@ val lfile_exists : Fpath.t -> bool

(* no raised Unix_error if the directory does not exist *)
val dir_exists : Fpath.t -> bool
(* JSON conversions for Unix.file_kind. A kind is encoded as a `String
 * holding its constructor name (e.g. "S_REG"). file_kind_of_yojson returns
 * Error with a message for any JSON value that is not one of those strings.
 *)
val file_kind_to_yojson : Unix.file_kind -> Yojson.Safe.t
val file_kind_of_yojson : Yojson.Safe.t -> (Unix.file_kind, string) result

(*****************************************************************************)
(* Filesystem manipulation *)
Expand Down
1 change: 1 addition & 0 deletions libs/commons/dune
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
ppx_deriving.show
ppx_deriving.eq
ppx_deriving.ord
ppx_deriving_yojson
ppx_hash
ppx_sexp_conv
ppx_inline_test
Expand Down
12 changes: 12 additions & 0 deletions libs/git_wrapper/Git_wrapper.ml
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ type blob_with_extra = { blob : blob; path : Fpath.t; size : int }
(* Short aliases for functions of the Commit, Hash and Blob modules. *)
let commit_digest = Commit.digest
let commit_author = Commit.author
let hex_of_hash = Hash.to_hex
(* NOTE(review): if Hash is Digestif-based (the .mli declares
 * hash = Digestif.SHA1.t), of_hex presumably raises Invalid_argument on
 * malformed input - confirm before calling it on untrusted strings. *)
let hash_of_hex = Hash.of_hex
let blob_digest = Blob.digest
let string_of_blob = Blob.to_string

Expand All @@ -100,6 +101,17 @@ exception Error of string
(* Helpers *)
(*****************************************************************************)

(* Encode a hash as a JSON string holding its hexadecimal form
 * (as produced by hex_of_hash). *)
let hash_to_yojson digest =
  let hex = hex_of_hash digest in
  `String hex

(* Decode a hash from JSON.
 *
 * Accepts a `String holding the hexadecimal form of the hash (the
 * encoding written by hash_to_yojson). Returns Error with a message when
 * the JSON value is not a `String, or when the string cannot be parsed as
 * a hexadecimal hash. This function reports failures through its result
 * and is not meant to raise on bad input.
 *)
let hash_of_yojson yojson =
  match yojson with
  | `String hex -> (
      (* A Digestif-style of_hex signals malformed input by raising
       * Invalid_argument; turn that into an Error so that callers of this
       * result-returning decoder do not get an unexpected exception. *)
      match hash_of_hex hex with
      | hash -> Ok hash
      | exception Invalid_argument msg ->
          Error
            (Printf.sprintf
               "Could not convert to Hash.t invalid hex string %S: %s" hex msg))
  | json ->
      Error
        (Printf.sprintf
           "Could not convert to Hash.t expected `String, received %s"
           Yojson.Safe.(to_string json))

(* diff unified format regex:
* https://www.gnu.org/software/diffutils/manual/html_node/Detailed-Unified.html#Detailed-Unified
* The above documentation isn't great, so unified diff format is
Expand Down
2 changes: 1 addition & 1 deletion libs/git_wrapper/Git_wrapper.mli
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ val logs : ?cwd:Fpath.t -> ?since:float option -> < Cap.exec > -> string list
the commits since the specified time.
*)

type hash = Digestif.SHA1.t [@@deriving show, eq, ord]
type hash = Digestif.SHA1.t [@@deriving show, eq, ord, yojson]
type value = hash Git.Value.t [@@deriving show, eq, ord]
type commit = hash Git.Commit.t [@@deriving show, eq, ord]
type author = Git.User.t [@@deriving show, eq, ord]
Expand Down
1 change: 1 addition & 0 deletions libs/git_wrapper/dune
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
ppx_deriving.show
ppx_deriving.eq
ppx_deriving.ord
ppx_deriving_yojson
ppx_sexp_conv
))
)
9 changes: 9 additions & 0 deletions libs/tracing/Tracing.mli
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,15 @@ val add_data_to_span : span -> (string * Trace_core.user_data) list -> unit
val add_data : (string * Trace_core.user_data) list -> config option -> unit
(** Convenience version of add_data_to_span for Semgrep *)

val trace_data_only :
?level:level ->
__FUNCTION__:string ->
__FILE__:string ->
__LINE__:int ->
string ->
(unit -> (string * Yojson.Safe.t) list) ->
unit
(** [trace_data_only ~__FUNCTION__ ~__FILE__ ~__LINE__ name f] records
    JSON data in a span of its own, for spans whose only purpose is to
    carry data.

    In the unix implementation this opens a span called [name], calls
    [f ()] inside that span, and attaches every returned [(key, json)]
    pair to the span, with [json] serialized to a string by
    [Yojson.Safe.to_string]. In the js implementation this is a no-op and
    [f] is never called. *)

(*****************************************************************************)
(* Entry points for setting up tracing *)
(*****************************************************************************)
Expand Down
1 change: 1 addition & 0 deletions libs/tracing/dune
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
(libraries
uri
trace
yojson
)
(virtual_modules tracing)
(default_implementation tracing.unix)
Expand Down
4 changes: 4 additions & 0 deletions libs/tracing/js/Tracing.ml
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,10 @@ let add_data (_data : (string * Trace_core.user_data) list) (_i : config option)
=
()

(* No-op implementation of trace_data_only for this (js) backend: no span
 * is created and the data producer is never called. Unused parameters are
 * prefixed with an underscore, like in add_data, so that they do not
 * trigger unused-variable warnings.
 *)
let trace_data_only ?level:(_level = Info) ~__FUNCTION__ ~__FILE__ ~__LINE__
    _name (_f : unit -> (string * Yojson.Safe.t) list) =
  ()

(*****************************************************************************)
(* Entry points for setting up tracing *)
(*****************************************************************************)
Expand Down
10 changes: 10 additions & 0 deletions libs/tracing/unix/Tracing.ml
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,16 @@ let add_data data (tracing_opt : config option) =
tracing.top_level_span
|> Option.iter (fun sp -> Trace_core.add_data_to_span sp data))

(* Open a span called [name] whose sole purpose is to carry data.
 *
 * The producer [f] is called inside the span; each JSON value it returns
 * is serialized with Yojson.Safe.to_string and attached to the span as a
 * `String under its key.
 *)
let trace_data_only ?(level = Info) ~__FUNCTION__ ~__FILE__ ~__LINE__ name
    (f : unit -> (string * Yojson.Safe.t) list) =
  let stringify (key, json) = (key, `String (Yojson.Safe.to_string json)) in
  with_span ~level ~__FUNCTION__ ~__FILE__ ~__LINE__ name (fun span ->
      let entries = f () in
      add_data_to_span span (List_.map stringify entries))

(*****************************************************************************)
(* Entry points for setting up tracing *)
(*****************************************************************************)
Expand Down
2 changes: 1 addition & 1 deletion src/rule/Lang.ml
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ type t = Language.t =
| Vue
| Xml
| Yaml
[@@deriving show { with_path = false }, eq, hash]
[@@deriving show { with_path = false }, eq, hash, yojson]

let has_tag tag_name =
let tbl = Hashtbl.create 50 in
Expand Down
2 changes: 1 addition & 1 deletion src/rule/Lang.mli
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ type t = Language.t =
| Vue
| Xml
| Yaml
[@@deriving show, eq, hash]
[@@deriving show, eq, hash, yojson]

(* unsupported_language_message [lang] takes the language as a string and
* returns an error message.
Expand Down
2 changes: 1 addition & 1 deletion src/rule/Xlang.ml
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ type t =
(* generic mode uses either spacegrep or aliengrep *)
| LSpacegrep
| LAliengrep
[@@deriving show, eq, hash]
[@@deriving show, eq, hash, yojson]

exception InternalInvalidLanguage of string (* rule id *) * string (* msg *)

Expand Down
Loading
Loading