diff --git a/pkg/detectors/azure_openai/azure_openai.go b/pkg/detectors/azure_openai/azure_openai.go
new file mode 100644
index 000000000000..9c6e9c890970
--- /dev/null
+++ b/pkg/detectors/azure_openai/azure_openai.go
@@ -0,0 +1,181 @@
+package azure_openai
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"io"
+	"net/http"
+
+	regexp "github.com/wasilibs/go-re2"
+
+	logContext "github.com/trufflesecurity/trufflehog/v3/pkg/context"
+	"github.com/trufflesecurity/trufflehog/v3/pkg/detectors"
+	"github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb"
+
+	"github.com/trufflesecurity/trufflehog/v3/pkg/common"
+)
+
+// Scanner detects API keys for Azure's OpenAI service.
+// https://learn.microsoft.com/en-us/azure/ai-services/openai/reference
+type Scanner struct {
+	// client performs verification requests; FromData falls back to
+	// common.SaneHttpClient() when it is nil.
+	client *http.Client
+}
+
+// Ensure the Scanner satisfies the interface at compile time.
+var _ detectors.Detector = (*Scanner)(nil)
+
+var (
+	// TODO: Investigate custom `azure-api.net` endpoints.
+	// https://github.com/openai/openai-python#microsoft-azure-openai
+	azureUrlPat = regexp.MustCompile(`(?i)([a-z0-9-]+\.openai\.azure\.com)`)
+	azureKeyPat = regexp.MustCompile(detectors.PrefixRegex([]string{"api[_.-]?key"}) + `\b(?-i:([a-f0-9]{32}))\b`)
+)
+
+// Keywords are used for efficiently pre-filtering chunks.
+// Use identifiers in the secret preferably, or the provider name.
+func (s Scanner) Keywords() []string {
+	return []string{".openai.azure.com"}
+}
+
+// FromData will find and optionally verify Azure OpenAI secrets in a given set of bytes.
+//
+// One result is emitted per distinct key. When verify is set, each key is tried
+// against the distinct `*.openai.azure.com` hosts found in the same data until
+// one of them accepts it.
+func (s Scanner) FromData(ctx context.Context, verify bool, data []byte) (results []detectors.Result, err error) {
+	dataStr := string(data)
+
+	// De-duplicate results.
+	tokens := make(map[string]struct{})
+	for _, match := range azureKeyPat.FindAllStringSubmatch(dataStr, -1) {
+		tokens[match[1]] = struct{}{}
+	}
+	if len(tokens) == 0 {
+		return
+	}
+	urls := make(map[string]struct{})
+	for _, match := range azureUrlPat.FindAllStringSubmatch(dataStr, -1) {
+		urls[match[1]] = struct{}{}
+	}
+
+	// Resolve the client once; it does not depend on the token or the host.
+	var client *http.Client
+	if verify {
+		client = s.client
+		if client == nil {
+			client = common.SaneHttpClient()
+		}
+	}
+
+	// Process results.
+	logCtx := logContext.AddLogger(ctx)
+	for token := range tokens {
+		s1 := detectors.Result{
+			DetectorType: s.Type(),
+			Redacted:     token[:3] + "..." + token[25:],
+			Raw:          []byte(token),
+		}
+
+		if verify {
+			// Remember a failed attempt so that a key no host accepted is
+			// reported as indeterminate rather than as a clean miss.
+			var lastErr error
+			for url := range urls {
+				isVerified, extraData, verificationErr := verifyAzureToken(logCtx, client, url, token)
+				if isVerified || len(urls) == 1 {
+					s1.RawV2 = []byte(token + ":" + url)
+					s1.Verified = isVerified
+					s1.ExtraData = extraData
+					lastErr = verificationErr
+					break
+				}
+				if verificationErr != nil {
+					lastErr = verificationErr
+				}
+			}
+			if lastErr != nil {
+				s1.SetVerificationError(lastErr, token)
+			}
+		}
+
+		results = append(results, s1)
+	}
+	return
+}
+
+// verifyAzureToken reports whether token is accepted by the Azure OpenAI resource at baseUrl.
+//
+// It lists the resource's deployments using the key. A 200 response whose JSON
+// body has a non-empty "object" field verifies the key; a 401 rejects it. A
+// non-nil error means the outcome is indeterminate (request failure, valid JSON
+// of an unexpected shape, or an unexpected status). The returned map is always nil.
+func verifyAzureToken(ctx logContext.Context, client *http.Client, baseUrl, token string) (bool, map[string]string, error) {
+	// TODO: Replace this with a more suitable long-term endpoint.
+	// Most endpoints require additional info, e.g., deployment name, which complicates verification.
+	// This may be retired in the future, so we should look for another candidate.
+	// https://learn.microsoft.com/en-us/answers/questions/1371786/get-azure-openai-deployments-in-api
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, fmt.Sprintf("https://%s/openai/deployments?api-version=2023-03-15-preview", baseUrl), nil)
+	if err != nil {
+		// Nothing was sent, so the key's validity is unknown; surface the error.
+		return false, nil, err
+	}
+
+	req.Header.Add("Content-Type", "application/json")
+	req.Header.Add("api-key", token)
+	res, err := client.Do(req)
+	if err != nil {
+		return false, nil, err
+	}
+	defer func() {
+		_, _ = io.Copy(io.Discard, res.Body)
+		_ = res.Body.Close()
+	}()
+
+	switch res.StatusCode {
+	case http.StatusOK:
+		body, err := io.ReadAll(res.Body)
+		if err != nil {
+			return false, nil, err
+		}
+
+		var deployments deploymentsResponse
+		if err := json.Unmarshal(body, &deployments); err != nil {
+			if json.Valid(body) {
+				return false, nil, fmt.Errorf("failed to decode response %s: %w", req.URL, err)
+			}
+			// If the response isn't JSON it's highly unlikely to be valid.
+			return false, nil, nil
+		}
+
+		// JSON unmarshal doesn't check whether the structure actually matches.
+		if deployments.Object == "" {
+			return false, nil, nil
+		}
+
+		// No extra data available at the moment.
+		return true, nil, nil
+	case http.StatusUnauthorized:
+		return false, nil, nil
+	default:
+		return false, nil, fmt.Errorf("unexpected response status %d for %s", res.StatusCode, req.URL)
+	}
+}
+
+// deploymentsResponse is the part of the deployments listing that verification decodes.
+type deploymentsResponse struct {
+	Data   []deployment `json:"data"`
+	Object string       `json:"object"`
+}
+
+// deployment is a single entry of deploymentsResponse.Data.
+type deployment struct {
+	ID string `json:"id"`
+}
+
+// Type returns the detector type attached to every result of this scanner.
+func (s Scanner) Type() detectorspb.DetectorType {
+	return detectorspb.DetectorType_AzureOpenAI
+}
diff --git a/pkg/detectors/azure_openai/azure_openai_test.go b/pkg/detectors/azure_openai/azure_openai_test.go
new file mode 100644
index 000000000000..db002243adb6
--- /dev/null
+++ b/pkg/detectors/azure_openai/azure_openai_test.go
@@ -0,0 +1,265 @@
+//go:build detectors
+// +build detectors
+
+package azure_openai
+
+import (
+	"context"
+	"fmt"
+	"testing"
+	"time"
+
+	"github.com/google/go-cmp/cmp"
+	"github.com/google/go-cmp/cmp/cmpopts"
+
+	"github.com/trufflesecurity/trufflehog/v3/pkg/common"
+	"github.com/trufflesecurity/trufflehog/v3/pkg/detectors"
+	"github.com/trufflesecurity/trufflehog/v3/pkg/engine/ahocorasick"
+	"github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb"
+)
+
+func TestAzureOpenAI_Pattern(t *testing.T) {
+	d := Scanner{}
+	ahoCorasickCore := ahocorasick.NewAhoCorasickCore([]detectors.Detector{d})
+	tests := []struct {
+		name  string
+		input string
+		want  []string
+	}{
+		{
+			name: "Generic environment variables",
+			input: `export OPENAI_API_VERSION=2023-07-15-preview
+export OPENAI_API_TYPE=AZURE
+export OPENAI_API_BASE=https://james-test-gpt4.openai.azure.com/
+export OPENAI_API_KEY=3397348fcdcb4a5fbeb6cceb5a6a284f`,
+			want: []string{"3397348fcdcb4a5fbeb6cceb5a6a284f"},
+		},
+		{
+			name: "Generic non-structured",
+			input: `# {'input': ['This is a test query.'], 'engine': 'text-embedding-ada-002'}
+# url /openai/deployments/text-embedding-ada-002/embeddings?api-version=2022-12-01
+# params {'input': ['This is a test query.'], 'encoding_format': 'base64'}
+# headers None
+# message='Request to OpenAI API' method=post path=https://notebook-openai01.openai.azure.com/openai/deployments/text-embedding-ada-002/embeddings?api-version=2022-12-01
+# api_version=2022-12-01 data='{"input": ["This is a test query."], "encoding_format": "base64"}' message='Post details'
+# https://notebook-openai01.openai.azure.com/openai/deployments/text-embedding-ada-002/embeddings?api-version=2022-12-01
+# {'X-OpenAI-Client-User-Agent': '{"bindings_version": "0.27.6", "httplib": "requests", "lang": "python", "lang_version": "3.11.2", "platform": "macOS-13.2-arm64-arm-64bit", "publisher": "openai", "uname": "Darwin 22.3.0 Darwin Kernel Version 22.3.0: Thu Jan 5 20:48:54 PST 2023; root:xnu-8792.81.2~2/RELEASE_ARM64_T6000 arm64 arm"}', 'User-Agent': 'OpenAI/v1 PythonBindings/0.27.6', 'api-key': '49eb7c2d3acd41f4ac31fef59ceacbba', 'OpenAI-Debug': 'true', 'Content-Type': 'application/json'}`,
+			want: []string{"49eb7c2d3acd41f4ac31fef59ceacbba"},
+		},
+		{
+			name: "Python",
+			input: `import openai
+
+openai.api_key = '1bb7dff73fe449de829363ea03bab134'
+openai.api_base = "https://hrcop-openai.openai.azure.com/"
+`,
+			want: []string{"1bb7dff73fe449de829363ea03bab134"},
+		},
+		{
+			name: "Python environment variables",
+			input: `os.environ["OPENAI_API_TYPE"] = "azure"
+os.environ["OPENAI_API_VERSION"] = "2023-03-15-preview"
+os.environ["OPENAI_API_BASE"] = "https://superhackathonai101-openai.openai.azure.com/"
+os.environ["OPENAI_API_KEY"] = '1bb7dde73fe449de229361ea03bab234'`,
+			want: []string{"1bb7dde73fe449de229361ea03bab234"},
+		},
+		{
+			name: "TypeScript",
+			input: `import OpenAI from "openai";
+export const openai = new OpenAI({
+  apiKey: "3375e3ad9a874cd6bd954b6f163be84f",
+  baseURL:
+    "https://kumar-azure.openai.azure.com/openai/deployments/ChatAutoUpdate",
+  defaultQuery: { "api-version": "2023-06-01-preview" },
+});`,
+			want: []string{"3375e3ad9a874cd6bd954b6f163be84f"},
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			chunkSpecificDetectors := make(map[ahocorasick.DetectorKey]detectors.Detector, 2)
+			ahoCorasickCore.PopulateMatchingDetectors(test.input, chunkSpecificDetectors)
+			if len(chunkSpecificDetectors) == 0 {
+				t.Errorf("keywords '%v' not matched by: %s", d.Keywords(), test.input)
+				return
+			}
+
+			results, err := d.FromData(context.Background(), false, []byte(test.input))
+			if err != nil {
+				t.Errorf("error = %v", err)
+				return
+			}
+
+			if len(results) != len(test.want) {
+				if len(results) == 0 {
+					t.Errorf("did not receive result")
+				} else {
+					t.Errorf("expected %d results, only received %d", len(test.want), len(results))
+				}
+				return
+			}
+
+			actual := make(map[string]struct{}, len(results))
+			for _, r := range results {
+				if len(r.RawV2) > 0 {
+					actual[string(r.RawV2)] = struct{}{}
+				} else {
+					actual[string(r.Raw)] = struct{}{}
+				}
+			}
+			expected := make(map[string]struct{}, len(test.want))
+			for _, v := range test.want {
+				expected[v] = struct{}{}
+			}
+
+			if diff := cmp.Diff(expected, actual); diff != "" {
+				t.Errorf("%s diff: (-want +got)\n%s", test.name, diff)
+			}
+		})
+	}
+}
+
+func TestAzureOpenAI_FromChunk(t *testing.T) {
+	ctx, cancel := context.WithTimeout(context.Background(), time.Second*5)
+	defer cancel()
+	testSecrets, err := common.GetSecret(ctx, "trufflehog-testing", "detectors5")
+	if err != nil {
+		t.Fatalf("could not get test secrets from GCP: %s", err)
+	}
+	secret := testSecrets.MustGetField("AZUREOPENAI")
+	inactiveSecret := testSecrets.MustGetField("AZUREOPENAI_INACTIVE")
+
+	type args struct {
+		ctx    context.Context
+		data   []byte
+		verify bool
+	}
+	tests := []struct {
+		name                string
+		s                   Scanner
+		args                args
+		want                []detectors.Result
+		wantErr             bool
+		wantVerificationErr bool
+	}{
+		{
+			name: "found, verified",
+			s:    Scanner{},
+			args: args{
+				ctx:    context.Background(),
+				data:   []byte(fmt.Sprintf("You can find a azureopenai secret %s within", secret)),
+				verify: true,
+			},
+			want: []detectors.Result{
+				{
+					DetectorType: detectorspb.DetectorType_AzureOpenAI,
+					Verified:     true,
+				},
+			},
+			wantErr:             false,
+			wantVerificationErr: false,
+		},
+		{
+			name: "found, unverified",
+			s:    Scanner{},
+			args: args{
+				ctx:    context.Background(),
+				data:   []byte(fmt.Sprintf("You can find a azureopenai secret %s within but not valid", inactiveSecret)), // the secret would satisfy the regex but not pass validation
+				verify: true,
+			},
+			want: []detectors.Result{
+				{
+					DetectorType: detectorspb.DetectorType_AzureOpenAI,
+					Verified:     false,
+				},
+			},
+			wantErr:             false,
+			wantVerificationErr: false,
+		},
+		{
+			name: "not found",
+			s:    Scanner{},
+			args: args{
+				ctx:    context.Background(),
+				data:   []byte("You cannot find the secret within"),
+				verify: true,
+			},
+			want:                nil,
+			wantErr:             false,
+			wantVerificationErr: false,
+		},
+		{
+			name: "found, would be verified if not for timeout",
+			s:    Scanner{client: common.SaneHttpClientTimeOut(1 * time.Microsecond)},
+			args: args{
+				ctx:    context.Background(),
+				data:   []byte(fmt.Sprintf("You can find a azureopenai secret %s within", secret)),
+				verify: true,
+			},
+			want: []detectors.Result{
+				{
+					DetectorType: detectorspb.DetectorType_AzureOpenAI,
+					Verified:     false,
+				},
+			},
+			wantErr:             false,
+			wantVerificationErr: true,
+		},
+		{
+			name: "found, verified but unexpected api surface",
+			s:    Scanner{client: common.ConstantResponseHttpClient(404, "")},
+			args: args{
+				ctx:    context.Background(),
+				data:   []byte(fmt.Sprintf("You can find a azureopenai secret %s within", secret)),
+				verify: true,
+			},
+			want: []detectors.Result{
+				{
+					DetectorType: detectorspb.DetectorType_AzureOpenAI,
+					Verified:     false,
+				},
+			},
+			wantErr:             false,
+			wantVerificationErr: true,
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got, err := tt.s.FromData(tt.args.ctx, tt.args.verify, tt.args.data)
+			if (err != nil) != tt.wantErr {
+				t.Errorf("Azureopenai.FromData() error = %v, wantErr %v", err, tt.wantErr)
+				return
+			}
+			for i := range got {
+				if len(got[i].Raw) == 0 {
+					t.Fatalf("no raw secret present: \n %+v", got[i])
+				}
+				if (got[i].VerificationError() != nil) != tt.wantVerificationErr {
+					t.Fatalf("wantVerificationError = %v, verification error = %v", tt.wantVerificationErr, got[i].VerificationError())
+				}
+			}
+			// FromData always fills Redacted, and RawV2 once a host is tried; `want` models neither.
+			ignoreOpts := cmpopts.IgnoreFields(detectors.Result{}, "Raw", "RawV2", "Redacted", "verificationError")
+			if diff := cmp.Diff(got, tt.want, ignoreOpts); diff != "" {
+				t.Errorf("Azureopenai.FromData() %s diff: (-got +want)\n%s", tt.name, diff)
+			}
+		})
+	}
+}
+
+func BenchmarkFromData(benchmark *testing.B) {
+	ctx := context.Background()
+	s := Scanner{}
+	for name, data := range detectors.MustGetBenchmarkData() {
+		benchmark.Run(name, func(b *testing.B) {
+			b.ResetTimer()
+			for n := 0; n < b.N; n++ {
+				_, err := s.FromData(ctx, false, data)
+				if err != nil {
+					b.Fatal(err)
+				}
+			}
+		})
+	}
+}
diff --git a/pkg/engine/defaults.go b/pkg/engine/defaults.go
index cc9fd589ffd0..0feb1f8ac0ad 100644
--- a/pkg/engine/defaults.go
+++ b/pkg/engine/defaults.go
@@ -62,6 +62,7 @@ import (
 	"github.com/trufflesecurity/trufflehog/v3/pkg/detectors/aylien"
 	"github.com/trufflesecurity/trufflehog/v3/pkg/detectors/ayrshare"
 	"github.com/trufflesecurity/trufflehog/v3/pkg/detectors/azure"
+	"github.com/trufflesecurity/trufflehog/v3/pkg/detectors/azure_openai"
 	"github.com/trufflesecurity/trufflehog/v3/pkg/detectors/azurebatch"
 	"github.com/trufflesecurity/trufflehog/v3/pkg/detectors/azurecontainerregistry"
 	"github.com/trufflesecurity/trufflehog/v3/pkg/detectors/azuredevopspersonalaccesstoken"
@@ -1616,6 +1617,7 @@ func DefaultDetectors() []detectors.Detector {
 		larksuite.Scanner{},
 		larksuiteapikey.Scanner{},
 		endorlabs.Scanner{},
+		&azure_openai.Scanner{},
 	}
 }
 
diff --git a/pkg/pb/detectorspb/detectors.pb.go b/pkg/pb/detectorspb/detectors.pb.go
index 2dd2d42b2697..af3152f228bd 100644
--- a/pkg/pb/detectorspb/detectors.pb.go
+++ b/pkg/pb/detectorspb/detectors.pb.go
@@ -1093,6 +1093,7 @@ const (
 	DetectorType_LarkSuite DetectorType = 991
 	DetectorType_LarkSuiteApiKey DetectorType = 992
 	DetectorType_EndorLabs DetectorType = 993
+	DetectorType_AzureOpenAI DetectorType = 994
 )
 
 // Enum value maps for DetectorType.
@@ -2088,6 +2089,7 @@ var (
 		991: "LarkSuite",
 		992: "LarkSuiteApiKey",
 		993: "EndorLabs",
+		994: "AzureOpenAI",
 	}
 	DetectorType_value = map[string]int32{
 		"Alibaba": 0,
@@ -3080,6 +3082,7 @@ var (
 		"LarkSuite": 991,
 		"LarkSuiteApiKey": 992,
 		"EndorLabs": 993,
+		"AzureOpenAI": 994,
 	}
 )
 
@@ -3533,7 +3536,7 @@ var file_detectors_proto_rawDesc = []byte{
 	0x4c, 0x41, 0x49, 0x4e, 0x10, 0x01, 0x12, 0x0a, 0x0a, 0x06, 0x42, 0x41, 0x53, 0x45, 0x36, 0x34,
 	0x10, 0x02, 0x12, 0x09, 0x0a, 0x05, 0x55, 0x54, 0x46, 0x31, 0x36, 0x10, 0x03, 0x12, 0x13, 0x0a,
 	0x0f, 0x45, 0x53, 0x43, 0x41, 0x50, 0x45, 0x44, 0x5f, 0x55, 0x4e, 0x49, 0x43, 0x4f, 0x44, 0x45,
-	0x10, 0x04, 0x2a, 0xf0, 0x7e, 0x0a, 0x0c, 0x44, 0x65, 0x74, 0x65, 0x63, 0x74, 0x6f, 0x72, 0x54,
+	0x10, 0x04, 0x2a, 0x82, 0x7f, 0x0a, 0x0c, 0x44, 0x65, 0x74, 0x65, 0x63, 0x74, 0x6f, 0x72, 0x54,
 	0x79, 0x70, 0x65, 0x12, 0x0b, 0x0a, 0x07, 0x41, 0x6c, 0x69, 0x62, 0x61, 0x62, 0x61, 0x10, 0x00,
 	0x12, 0x08, 0x0a, 0x04, 0x41, 0x4d, 0x51, 0x50, 0x10, 0x01, 0x12, 0x07, 0x0a, 0x03, 0x41, 0x57,
 	0x53, 0x10, 0x02, 0x12, 0x09, 0x0a, 0x05, 0x41, 0x7a, 0x75, 0x72, 0x65, 0x10, 0x03, 0x12, 0x0a,
@@ -4548,11 +4551,12 @@ var file_detectors_proto_rawDesc = []byte{
 	0x0e, 0x0a, 0x09, 0x4c, 0x61, 0x72, 0x6b, 0x53, 0x75, 0x69, 0x74, 0x65, 0x10, 0xdf, 0x07, 0x12,
 	0x14, 0x0a, 0x0f, 0x4c, 0x61, 0x72, 0x6b, 0x53, 0x75, 0x69, 0x74, 0x65, 0x41, 0x70, 0x69, 0x4b,
 	0x65, 0x79, 0x10, 0xe0, 0x07, 0x12, 0x0e, 0x0a, 0x09, 0x45, 0x6e, 0x64, 0x6f, 0x72, 0x4c, 0x61,
-	0x62, 0x73, 0x10, 0xe1, 0x07, 0x42, 0x3d, 0x5a, 0x3b, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, 0x2e,
-	0x63, 0x6f, 0x6d, 0x2f, 0x74, 0x72, 0x75, 0x66, 0x66, 0x6c, 0x65, 0x73, 0x65, 0x63, 0x75, 0x72,
-	0x69, 0x74, 0x79, 0x2f, 0x74, 0x72, 0x75, 0x66, 0x66, 0x6c, 0x65, 0x68, 0x6f, 0x67, 0x2f, 0x76,
-	0x33, 0x2f, 0x70, 0x6b, 0x67, 0x2f, 0x70, 0x62, 0x2f, 0x64, 0x65, 0x74, 0x65, 0x63, 0x74, 0x6f,
-	0x72, 0x73, 0x70, 0x62, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33,
+	0x62, 0x73, 0x10, 0xe1, 0x07, 0x12, 0x10, 0x0a, 0x0b, 0x41, 0x7a, 0x75, 0x72, 0x65, 0x4f, 0x70,
+	0x65, 0x6e, 0x41, 0x49, 0x10, 0xe2, 0x07, 0x42, 0x3d, 0x5a, 0x3b, 0x67, 0x69, 0x74, 0x68, 0x75,
+	0x62, 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x74, 0x72, 0x75, 0x66, 0x66, 0x6c, 0x65, 0x73, 0x65, 0x63,
+	0x75, 0x72, 0x69, 0x74, 0x79, 0x2f, 0x74, 0x72, 0x75, 0x66, 0x66, 0x6c, 0x65, 0x68, 0x6f, 0x67,
+	0x2f, 0x76, 0x33, 0x2f, 0x70, 0x6b, 0x67, 0x2f, 0x70, 0x62, 0x2f, 0x64, 0x65, 0x74, 0x65, 0x63,
+	0x74, 0x6f, 0x72, 0x73, 0x70, 0x62, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33,
 }
 
 var (
diff --git a/proto/detectors.proto b/proto/detectors.proto
index ac4ae1153976..d0d9194e56fe 100644
--- a/proto/detectors.proto
+++ b/proto/detectors.proto
@@ -1003,6 +1003,7 @@ enum DetectorType {
     LarkSuite = 991;
     LarkSuiteApiKey = 992;
     EndorLabs = 993;
+    AzureOpenAI = 994;
 }
 
 message Result {