Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

C#/Java: Content based model generation improvements. #17363

Merged
merged 10 commits into from
Sep 19, 2024
Original file line number Diff line number Diff line change
Expand Up @@ -293,9 +293,12 @@ class ContentSet extends TContentSet {
*/
predicate isProperty(Property p) { this = TPropertyContentSet(p) }

/** Holds if this content set represent the field `f`. */
/** Holds if this content set represents the field `f`. */
predicate isField(Field f) { this.isSingleton(TFieldContent(f)) }

/** Holds if this content set represents the synthetic field `s`. */
predicate isSyntheticField(string s) { this.isSingleton(TSyntheticFieldContent(s)) }

/** Holds if this content set represents an element in a collection. */
predicate isElement() { this.isSingleton(TElementContent()) }

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
/**
* @name Capture content based summary models.
* @description Finds applicable content based summary models to be used by other queries.
* @kind diagnostic
* @id cs/utils/modelgenerator/contentbased-summary-models
* @tags modelgenerator
*/

import internal.CaptureModels

from DataFlowSummaryTargetApi api, string flow
where flow = captureContentFlow(api)
select flow order by flow
261 changes: 252 additions & 9 deletions csharp/ql/src/utils/modelgenerator/internal/CaptureModels.qll
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ string captureQualifierFlow(DataFlowSummaryTargetApi api) {
api = returnNodeEnclosingCallable(ret) and
isOwnInstanceAccessNode(ret)
) and
result = Printing::asValueModel(api, qualifierString(), "ReturnValue")
result = Printing::asLiftedValueModel(api, qualifierString(), "ReturnValue")
}

private int accessPathLimit0() { result = 2 }
Expand Down Expand Up @@ -237,7 +237,7 @@ string captureThroughFlow0(
input = parameterNodeAsInput(p) and
output = getOutput(returnNodeExt) and
input != output and
result = Printing::asTaintModel(api, input, output)
result = Printing::asLiftedTaintModel(api, input, output)
)
}

Expand Down Expand Up @@ -291,26 +291,269 @@ private string getContent(PropagateContentFlow::AccessPath ap, int i) {
)
}

/**
* Gets the MaD string representation of a store step access path.
*/
private string printStoreAccessPath(PropagateContentFlow::AccessPath ap) {
result = concat(int i | | getContent(ap, i), "" order by i)
}

/**
* Gets the MaD string representation of a read step access path.
*/
private string printReadAccessPath(PropagateContentFlow::AccessPath ap) {
result = concat(int i | | getContent(ap, i), "" order by i desc)
}

string captureContentFlow(DataFlowSummaryTargetApi api) {
/**
* Holds if the access path `ap` contains a field or synthetic field access.
*/
private predicate mentionsField(PropagateContentFlow::AccessPath ap) {
exists(ContentSet head, PropagateContentFlow::AccessPath tail |
head = ap.getHead() and
tail = ap.getTail() and
(mentionsField(tail) or isField(head))
michaelnebel marked this conversation as resolved.
Show resolved Hide resolved
)
}

private predicate apiFlow(
DataFlowSummaryTargetApi api, DataFlow::ParameterNode p, PropagateContentFlow::AccessPath reads,
ReturnNodeExt returnNodeExt, PropagateContentFlow::AccessPath stores, boolean preservesValue
) {
PropagateContentFlow::flow(p, reads, returnNodeExt, stores, preservesValue) and
returnNodeExt.getEnclosingCallable() = api and
p.getEnclosingCallable() = api
}

/**
* A class of APIs relevant for modeling using content flow.
* The following heuristic is applied:
* Content flow is only relevant for an API, if
* #content flow <= 2 * #parameters + 3
* If an API produces more content flow, it is likely that
* 1. Types are not sufficiently constrained leading to a combinatorial
* explosion in dispatch and thus in the generated summaries.
* 2. It is a reasonable approximation to use the non-content based flow
* detection instead, as reads and stores would use a significant
* part of an objects internal state.
*/
private class ContentDataFlowSummaryTargetApi extends DataFlowSummaryTargetApi {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Alternatively, we could make the restriction per parameter. I.e., if a method has only one summary for Argument[0] then include it, but if it has more than, say 3, summaries for Argument[1] then exclude those.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, that could also be interesting.
Would it be acceptable if I do that experiment as a follow up?
As it is now we "only" exclude approximately 2% of all the possible target APIs with this limitation.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes!

ContentDataFlowSummaryTargetApi() {
count(string input, string output |
exists(
DataFlow::ParameterNode p, PropagateContentFlow::AccessPath reads,
ReturnNodeExt returnNodeExt, PropagateContentFlow::AccessPath stores
|
apiFlow(this, p, reads, returnNodeExt, stores, _) and
input = parameterNodeAsContentInput(p) + printReadAccessPath(reads) and
output = getContentOutput(returnNodeExt) + printStoreAccessPath(stores)
)
) <= 2 * this.getNumberOfParameters() + 3
}
}

pragma[nomagic]
private predicate apiContentFlow(
ContentDataFlowSummaryTargetApi api, DataFlow::ParameterNode p,
PropagateContentFlow::AccessPath reads, ReturnNodeExt returnNodeExt,
PropagateContentFlow::AccessPath stores, boolean preservesValue
) {
PropagateContentFlow::flow(p, reads, returnNodeExt, stores, preservesValue) and
returnNodeExt.getEnclosingCallable() = api and
p.getEnclosingCallable() = api
}

/**
* Holds if any of the content sets in `path` translates into a synthetic field.
*/
private predicate hasSyntheticContent(PropagateContentFlow::AccessPath path) {
exists(PropagateContentFlow::AccessPath tail, ContentSet head |
head = path.getHead() and
tail = path.getTail() and
(
michaelnebel marked this conversation as resolved.
Show resolved Hide resolved
exists(getSyntheticName(head)) or
hasSyntheticContent(tail)
)
)
}

/**
* A module containing predicates for validating access paths containing content sets
* that translates into synthetic fields, when used for generated summary models.
*/
private module AccessPathSyntheticValidation {
/**
* Holds if there exists an API that has content flow from `read` (on type `t1`)
* to `store` (on type `t2`).
*/
private predicate step(
Type t1, PropagateContentFlow::AccessPath read, Type t2, PropagateContentFlow::AccessPath store
) {
exists(DataFlow::ParameterNode p, ReturnNodeExt returnNodeExt |
p.getType() = t1 and
returnNodeExt.getType() = t2 and
apiContentFlow(_, p, read, returnNodeExt, store, _)
)
}

/**
* Holds if there exists an API that has content flow from `read` (on type `t1`)
* to `store` (on type `t2`), where `read` does not have synthetic content and `store` does.
*
* Step A -> Synth.
*/
private predicate synthPathEntry(
Type t1, PropagateContentFlow::AccessPath read, Type t2, PropagateContentFlow::AccessPath store
) {
not hasSyntheticContent(read) and
hasSyntheticContent(store) and
step(t1, read, t2, store)
}

/**
* Holds if there exists an API that has content flow from `read` (on type `t1`)
* to `store` (on type `t2`), where `read` has synthetic content
* and `store` does not.
*
* Step Synth -> A.
*/
private predicate synthPathExit(
Type t1, PropagateContentFlow::AccessPath read, Type t2, PropagateContentFlow::AccessPath store
) {
hasSyntheticContent(read) and
not hasSyntheticContent(store) and
step(t1, read, t2, store)
}

/**
* Takes one or more synthetic steps.
* Synth ->+ Synth
*/
private predicate synthPathStepRec(
Type t1, PropagateContentFlow::AccessPath read, Type t2, PropagateContentFlow::AccessPath store
) {
hasSyntheticContent(read) and
hasSyntheticContent(store) and
(
step(t1, read, t2, store)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't we restrict to syntPathEntry here? Otherwise it looks like we will compute O(n^2) paths.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Uh - this was intended as a helper predicate, but it looks like we need to fold it into the declarations where it is used to avoid the O(n^2).

or
exists(PropagateContentFlow::AccessPath mid, Type midType |
step(t1, read, midType, mid) and synthPathStepRec(midType, mid.reverse(), t2, store)
)
)
}

/**
* Holds if there exists a path of steps from `read` to an exit.
*
* read ->* Synth -> A
*/
private predicate reachesSynthExit(Type t, PropagateContentFlow::AccessPath read) {
synthPathExit(t, read, _, _)
or
exists(PropagateContentFlow::AccessPath mid, Type midType |
synthPathStepRec(t, read, midType, mid) and synthPathExit(midType, mid.reverse(), _, _)
)
}

/**
* Holds if there exists a path of steps from an entry to `store`.
*
* A -> Synth ->* store
*/
private predicate synthEntryReaches(Type t, PropagateContentFlow::AccessPath store) {
synthPathEntry(_, _, t, store)
or
exists(PropagateContentFlow::AccessPath mid, Type midType |
synthPathEntry(_, _, midType, mid) and synthPathStepRec(midType, mid.reverse(), t, store)
)
}

/**
* Holds if at least one of the access paths `read` (on type `t1`) and `store` (on type `t2`)
* contain content that will be translated into a synthetic field, when being used in
* a MaD summary model, and if there is a range of APIs, such that
* when chaining their flow access paths, there exists access paths `A` and `B` where
* A ->* read -> store ->* B and where `A` and `B` do not contain content that will
* be translated into a synthetic field.
*
* This is needed because we don't want to include summaries that reads from or
* stores into a "dead" synthetic field.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"internal" instead of "dead"?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure! I will add a commit to the re-factor PR that changes this.

*
* Example:
* Assume we have a type `t` (in this case `t1` = `t2`) with methods `getX` and
* `setX`, which gets and sets a private field `X` on `t`.
* This would lead to the following content flows
* getX : Argument[this].SyntheticField[t.X] -> ReturnValue.
* setX : Argument[0] -> Argument[this].SyntheticField[t.X]
* As the reads and stores are on synthetic fields we should only make summaries
* if both of these methods exist.
*/
pragma[nomagic]
predicate acceptReadStore(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder if it is simply enough to say that we will include a synthetic field if it is part of some input specification and part of some output specification. That should also handle cases such as

setAll // input: Argument[0], output: ReturnValue.SyntheticField[Foo].Element
get // input: Argument[0].SyntheticField[Foo], output: ReturnValue

list = setAll("taint");
x = list[0];
sink(get(x));

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I also thought about that as a possible "improvement" (instead of using "path" equality we could "hash" paths with synthetics (basically just print the synthetics in the order they are shown in the path and then compare that)). This would allow path continuations as the one you mention there.
Is it acceptable that we attempt this as a follow up?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was actually thinking of something even simpler where the order is not taken into account, but yes follow-up is fine.

Copy link
Contributor Author

@michaelnebel michaelnebel Sep 19, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We could also try without order; However, I am quite sure that we need the "chaining" logic. It turned out that it is not uncommen that we produce summaries like SyntheticField[A] -> SyntheticField[A] (without other mentions on the synthetic field A) or SyntheticField[A] -> SyntheticField[B] and SyntheticField[B] -> SyntheticField[A], if we don't restrict the use of synthetics.

Type t1, PropagateContentFlow::AccessPath read, Type t2, PropagateContentFlow::AccessPath store
) {
synthPathEntry(t1, read, t2, store) and reachesSynthExit(t2, store.reverse())
or
exists(PropagateContentFlow::AccessPath store0 | store0.reverse() = read |
synthEntryReaches(t1, store0) and synthPathExit(t1, read, t2, store)
or
synthEntryReaches(t1, store0) and
step(t1, read, t2, store) and
reachesSynthExit(t2, store.reverse())
)
}
}

/**
* Holds, if the API `api` has relevant flow from `read` on `p` to `store` on `returnNodeExt`.
* Flow is considered relevant,
* 1. If `read` or `store` do not contain a content set that translates into a synthetic field.
* 2. If `read` or `store` contain a content set that translates into a synthetic field, and if
* the synthetic content is "live" on the relevant declaring type.
*/
private predicate apiRelevantContentFlow(
ContentDataFlowSummaryTargetApi api, DataFlow::ParameterNode p,
PropagateContentFlow::AccessPath read, ReturnNodeExt returnNodeExt,
PropagateContentFlow::AccessPath store, boolean preservesValue
) {
apiContentFlow(api, p, read, returnNodeExt, store, preservesValue) and
(
not hasSyntheticContent(read) and not hasSyntheticContent(store)
or
AccessPathSyntheticValidation::acceptReadStore(p.getType(), read, returnNodeExt.getType(), store)
)
}

pragma[nomagic]
private predicate captureContentFlow0(
Fixed Show fixed Hide fixed
ContentDataFlowSummaryTargetApi api, string input, string output, boolean preservesValue,
boolean lift
) {
exists(
DataFlow::ParameterNode p, ReturnNodeExt returnNodeExt, string input, string output,
PropagateContentFlow::AccessPath reads, PropagateContentFlow::AccessPath stores,
boolean preservesValue
DataFlow::ParameterNode p, ReturnNodeExt returnNodeExt, PropagateContentFlow::AccessPath reads,
PropagateContentFlow::AccessPath stores
|
PropagateContentFlow::flow(p, reads, returnNodeExt, stores, preservesValue) and
returnNodeExt.getEnclosingCallable() = api and
apiRelevantContentFlow(api, p, reads, returnNodeExt, stores, preservesValue) and
input = parameterNodeAsContentInput(p) + printReadAccessPath(reads) and
output = getContentOutput(returnNodeExt) + printStoreAccessPath(stores) and
input != output and
result = Printing::asModel(api, input, output, preservesValue)
(if mentionsField(reads) or mentionsField(stores) then lift = false else lift = true)
)
}

/**
* Gets the content based summary model(s) of the API `api` (if there is flow from a parameter to
* the return value or a parameter).
*
* Models are lifted to the best type in case the read and store access paths do not
* contain a field or synthetic field access.
*/
string captureContentFlow(ContentDataFlowSummaryTargetApi api) {
exists(string input, string output, boolean lift, boolean preservesValue |
captureContentFlow0(api, input, output, _, lift) and
preservesValue = max(boolean p | captureContentFlow0(api, input, output, p, lift)) and
result = Printing::asModel(api, input, output, preservesValue, lift)
)
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -390,23 +390,47 @@ private string getFullyQualifiedName(Declaration d) {
}

/**
* Gets the MaD string representation of the contentset `c`.
* Holds if the content set `c` is a field, property or synthetic field.
*/
predicate isField(ContentSet c) { c.isField(_) or c.isSyntheticField(_) or c.isProperty(_) }

/**
* Gets the MaD synthetic name string representation for the content set `c`, if any.
*/
string getSyntheticName(DataFlow::ContentSet c) {
exists(CS::Field f |
not f.isEffectivelyPublic() and
c.isField(f) and
result = getFullyQualifiedName(f)
)
or
exists(CS::Property p |
not p.isEffectivelyPublic() and
c.isProperty(p) and
result = getFullyQualifiedName(p)
)
or
c.isSyntheticField(result)
}

/**
* Gets the MaD string representation of the content set `c`.
*/
string printContent(DataFlow::ContentSet c) {
exists(CS::Field f, string name | name = getFullyQualifiedName(f) |
c.isField(f) and
if f.isEffectivelyPublic()
then result = "Field[" + name + "]"
else result = "SyntheticField[" + name + "]"
f.isEffectivelyPublic() and
result = "Field[" + name + "]"
)
or
exists(CS::Property p, string name | name = getFullyQualifiedName(p) |
c.isProperty(p) and
if p.isEffectivelyPublic()
then result = "Property[" + name + "]"
else result = "SyntheticField[" + name + "]"
p.isEffectivelyPublic() and
result = "Property[" + name + "]"
)
or
result = "SyntheticField[" + getSyntheticName(c) + "]"
or
c.isElement() and
result = "Element"
}
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,7 @@ class TypeBasedFlowTargetApi extends Specific::SummaryTargetApi {
output(this, tp, output) and
input != output
|
result = Printing::asValueModel(this, input, output)
result = Printing::asLiftedValueModel(this, input, output)
)
}
}
Expand Down
Loading
Loading