diff --git a/client_test.go b/client_test.go
index 99ef81e..5c4ef22 100644
--- a/client_test.go
+++ b/client_test.go
@@ -14,6 +14,7 @@ import (
 	"net"
 	"net/http"
 	"net/http/httptest"
+	"net/url"
 	"os"
 	"path"
 	"path/filepath"
@@ -1848,6 +1849,366 @@ func TestHTTPClientWithIPv6Disabled(t *testing.T) {
 	}
 }
 
+func TestHTTPClientPOSTWithTextPayload(t *testing.T) {
+	var (
+		rotatorSettings = defaultRotatorSettings(t)
+		err             error
+	)
+
+	// Test server: answers 405 to anything but POST, otherwise replies 200 with "Received: " followed by the request body.
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		if r.Method != "POST" {
+			t.Errorf("Expected POST request, got %s", r.Method)
+			w.WriteHeader(http.StatusMethodNotAllowed)
+			return
+		}
+
+		body, err := io.ReadAll(r.Body)
+		if err != nil {
+			t.Errorf("Failed to read request body: %v", err)
+			w.WriteHeader(http.StatusInternalServerError)
+			return
+		}
+
+		w.Header().Set("Content-Type", "text/plain")
+		w.WriteHeader(http.StatusOK)
+		w.Write([]byte("Received: "))
+		w.Write(body)
+	}))
+	defer server.Close()
+
+	// Initialize the WARC-writing HTTP client with the default rotator settings for this test
+	httpClient, err := NewWARCWritingHTTPClient(HTTPClientSettings{RotatorSettings: rotatorSettings})
+	if err != nil {
+		t.Fatalf("Unable to init WARC writing HTTP client: %s", err)
+	}
+	waitForErrors := drainErrChan(t, httpClient.ErrChan)
+
+	// Create a POST request with a text/plain payload aimed at the test server
+	requestBody := strings.NewReader("Hello from POST request")
+	req, err := http.NewRequest("POST", server.URL, requestBody)
+	if err != nil {
+		t.Fatal(err)
+	}
+	req.Header.Set("Content-Type", "text/plain")
+
+	resp, err := httpClient.Do(req)
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer resp.Body.Close()
+
+	io.Copy(io.Discard, resp.Body) // drain the response body; the copy error is ignored
+
+	httpClient.Close()
+	waitForErrors()
+
+	files, err := filepath.Glob(rotatorSettings.OutputDirectory + "/*")
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Verify that at least one file exists in the rotator output directory
+	if len(files) == 0 {
+		t.Fatal("No WARC files were created")
+	}
+
+	// Check every output file: run the shared hash check, then scan its records for the POST request
+	for _, path := range files { // NOTE(review): "path" shadows the imported path package inside this loop
+		testFileSingleHashCheck(t, path, "sha1:RFV2ZU2BHITF3PW7BSPBQE65GFZS7F5G", []string{"154"}, 1, server.URL+"/")
+
+		file, err := os.Open(path)
+		if err != nil {
+			t.Fatalf("failed to open %q: %v", path, err)
+		}
+		defer file.Close() // NOTE(review): deferred inside the loop, so it runs when the test returns, not per file
+
+		reader, err := NewReader(file)
+		if err != nil {
+			t.Fatalf("warc.NewReader failed for %q: %v", path, err)
+		}
+
+		foundRequest := false
+
+		for {
+			record, err := reader.ReadRecord()
+			if err != nil {
+				if err == io.EOF {
+					break
+				}
+				t.Fatalf("warc.ReadRecord failed: %v", err)
+			}
+
+			// Only "request" records are inspected. NOTE(review): the io.ReadAll error below is discarded.
+			if record.Header.Get("WARC-Type") == "request" {
+				foundRequest = true
+				record.Content.Seek(0, 0)
+				content, _ := io.ReadAll(record.Content)
+				contentStr := string(content)
+
+				// Verify it's a POST request (substring match over the whole record content, not only the request line)
+				if !strings.Contains(contentStr, "POST") {
+					t.Errorf("Request record does not contain POST method")
+				}
+
+				// Verify the request body sent above is present in the record
+				if !strings.Contains(contentStr, "Hello from POST request") {
+					t.Errorf("Request record does not contain the expected request body")
+				}
+			}
+
+			record.Content.Close()
+		}
+
+		if !foundRequest {
+			t.Error("No request record found in WARC file")
+		}
+	}
+}
+
+func TestHTTPClientPOSTWithJSONPayload(t *testing.T) {
+	var (
+		rotatorSettings = defaultRotatorSettings(t)
+		err             error
+	)
+
+	// Test server: answers 405 to anything but POST, otherwise replies 201 with the request body wrapped in a JSON envelope.
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		if r.Method != "POST" {
+			t.Errorf("Expected POST request, got %s", r.Method)
+			w.WriteHeader(http.StatusMethodNotAllowed)
+			return
+		}
+
+		if r.Header.Get("Content-Type") != "application/json" { // mismatch is reported but the request is still handled
+			t.Errorf("Expected Content-Type: application/json, got %s", r.Header.Get("Content-Type"))
+		}
+
+		body, err := io.ReadAll(r.Body)
+		if err != nil {
+			t.Errorf("Failed to read request body: %v", err)
+			w.WriteHeader(http.StatusInternalServerError)
+			return
+		}
+
+		w.Header().Set("Content-Type", "application/json")
+		w.WriteHeader(http.StatusCreated)
+		w.Write([]byte(`{"status":"success","received":`))
+		w.Write(body)
+		w.Write([]byte(`}`))
+	}))
+	defer server.Close()
+
+	// Initialize the WARC-writing HTTP client with the default rotator settings for this test
+	httpClient, err := NewWARCWritingHTTPClient(HTTPClientSettings{RotatorSettings: rotatorSettings})
+	if err != nil {
+		t.Fatalf("Unable to init WARC writing HTTP client: %s", err)
+	}
+	waitForErrors := drainErrChan(t, httpClient.ErrChan)
+
+	// Create a POST request with a JSON payload; the same literal is searched for in the WARC record below
+	jsonPayload := `{"name":"test","value":123}`
+	requestBody := strings.NewReader(jsonPayload)
+	req, err := http.NewRequest("POST", server.URL, requestBody)
+	if err != nil {
+		t.Fatal(err)
+	}
+	req.Header.Set("Content-Type", "application/json")
+
+	resp, err := httpClient.Do(req)
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer resp.Body.Close()
+
+	io.Copy(io.Discard, resp.Body) // drain the response body; the copy error is ignored
+
+	httpClient.Close()
+	waitForErrors()
+
+	files, err := filepath.Glob(rotatorSettings.OutputDirectory + "/*")
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Verify that at least one file exists in the rotator output directory
+	if len(files) == 0 {
+		t.Fatal("No WARC files were created")
+	}
+
+	// Check every output file: run the shared hash check, then scan its records for the JSON body
+	for _, path := range files { // NOTE(review): "path" shadows the imported path package inside this loop
+		testFileSingleHashCheck(t, path, "sha1:IAKLOIOTQX2W7PAAWWA2TELLU5HCKO3V", []string{"191"}, 1, server.URL+"/")
+
+		file, err := os.Open(path)
+		if err != nil {
+			t.Fatalf("failed to open %q: %v", path, err)
+		}
+		defer file.Close() // NOTE(review): deferred inside the loop, so it runs when the test returns, not per file
+
+		reader, err := NewReader(file)
+		if err != nil {
+			t.Fatalf("warc.NewReader failed for %q: %v", path, err)
+		}
+
+		foundJSONRequest := false
+
+		for {
+			record, err := reader.ReadRecord()
+			if err != nil {
+				if err == io.EOF {
+					break
+				}
+				t.Fatalf("warc.ReadRecord failed: %v", err)
+			}
+
+			// Only "request" records are inspected. NOTE(review): the io.ReadAll error below is discarded.
+			if record.Header.Get("WARC-Type") == "request" {
+				record.Content.Seek(0, 0)
+				content, _ := io.ReadAll(record.Content)
+				contentStr := string(content)
+
+				// Verify it's a POST request (substring match over the whole record content, not only the request line)
+				if !strings.Contains(contentStr, "POST") {
+					t.Errorf("Request record does not contain POST method")
+				}
+
+				// Verify the JSON payload is present; the flag is set only when the exact literal appears
+				if strings.Contains(contentStr, jsonPayload) {
+					foundJSONRequest = true
+				}
+			}
+
+			record.Content.Close()
+		}
+
+		if !foundJSONRequest {
+			t.Error("JSON payload not found in request record")
+		}
+	}
+}
+
+func TestHTTPClientPOSTWithFormData(t *testing.T) {
+	var (
+		rotatorSettings = defaultRotatorSettings(t)
+		err             error
+	)
+
+	// Test server: answers 405 to anything but POST, 400 if the form cannot be parsed, otherwise 200 with the username and the password length.
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		if r.Method != "POST" {
+			t.Errorf("Expected POST request, got %s", r.Method)
+			w.WriteHeader(http.StatusMethodNotAllowed)
+			return
+		}
+
+		err := r.ParseForm()
+		if err != nil {
+			t.Errorf("Failed to parse form: %v", err)
+			w.WriteHeader(http.StatusBadRequest)
+			return
+		}
+
+		username := r.FormValue("username")
+		password := r.FormValue("password")
+
+		w.Header().Set("Content-Type", "text/plain")
+		w.WriteHeader(http.StatusOK)
+		w.Write([]byte("Login attempt for user: " + username + " (password length: " + strconv.Itoa(len(password)) + ")"))
+	}))
+	defer server.Close()
+
+	// Initialize the WARC-writing HTTP client with the default rotator settings for this test
+	httpClient, err := NewWARCWritingHTTPClient(HTTPClientSettings{RotatorSettings: rotatorSettings})
+	if err != nil {
+		t.Fatalf("Unable to init WARC writing HTTP client: %s", err)
+	}
+	waitForErrors := drainErrChan(t, httpClient.ErrChan)
+
+	// Create a POST request whose body is the URL-encoded form built from url.Values
+	formData := url.Values{}
+	formData.Set("username", "testuser")
+	formData.Set("password", "testpass123")
+
+	req, err := http.NewRequest("POST", server.URL, strings.NewReader(formData.Encode()))
+	if err != nil {
+		t.Fatal(err)
+	}
+	req.Header.Set("Content-Type", "application/x-www-form-urlencoded")
+
+	resp, err := httpClient.Do(req)
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer resp.Body.Close()
+
+	io.Copy(io.Discard, resp.Body) // drain the response body; the copy error is ignored
+
+	httpClient.Close()
+	waitForErrors()
+
+	files, err := filepath.Glob(rotatorSettings.OutputDirectory + "/*")
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Verify that at least one file exists in the rotator output directory
+	if len(files) == 0 {
+		t.Fatal("No WARC files were created")
+	}
+
+	// Check every output file: run the shared hash check, then scan its records for the form fields
+	for _, path := range files { // NOTE(review): "path" shadows the imported path package inside this loop
+		testFileSingleHashCheck(t, path, "sha1:DGXE2J6TLUT3GYLTA2LNA4NQMMPF5SWX", []string{"175"}, 1, server.URL+"/")
+
+		file, err := os.Open(path)
+		if err != nil {
+			t.Fatalf("failed to open %q: %v", path, err)
+		}
+		defer file.Close() // NOTE(review): deferred inside the loop, so it runs when the test returns, not per file
+
+		reader, err := NewReader(file)
+		if err != nil {
+			t.Fatalf("warc.NewReader failed for %q: %v", path, err)
+		}
+
+		foundFormRequest := false
+
+		for {
+			record, err := reader.ReadRecord()
+			if err != nil {
+				if err == io.EOF {
+					break
+				}
+				t.Fatalf("warc.ReadRecord failed: %v", err)
+			}
+
+			// Only "request" records are inspected. NOTE(review): the io.ReadAll error below is discarded.
+			if record.Header.Get("WARC-Type") == "request" {
+				record.Content.Seek(0, 0)
+				content, _ := io.ReadAll(record.Content)
+				contentStr := string(content)
+
+				// Verify it's a POST request (substring match over the whole record content, not only the request line)
+				if !strings.Contains(contentStr, "POST") {
+					t.Errorf("Request record does not contain POST method")
+				}
+
+				// Verify both URL-encoded pairs are present; each is matched independently, so their order does not matter
+				if strings.Contains(contentStr, "username=testuser") && strings.Contains(contentStr, "password=testpass123") {
+					foundFormRequest = true
+				}
+			}
+
+			record.Content.Close()
+		}
+
+		if !foundFormRequest {
+			t.Error("Form data not found in request record")
+		}
+	}
+}
+
 // MARK: Benchmarks
 func BenchmarkConcurrentUnder2MB(b *testing.B) {
 	var (
diff --git a/dedupe.go b/dedupe.go
index ba1b36b..c93cb94 100644
--- a/dedupe.go
+++ b/dedupe.go
@@ -51,7 +51,7 @@ func (d *customDialer) checkLocalRevisit(digest string) revisitRecord {
 func checkCDXRevisit(CDXURL string, digest string, targetURI string, cookie string) (revisitRecord, error) {
 	// CDX expects no hash header. For now we need to strip it.
 	digest = strings.SplitN(digest, ":", 2)[1]
-	
+
 	req, err := http.NewRequest("GET", CDXURL+"/web/timemap/cdx?url="+url.QueryEscape(targetURI)+"&limit=-1", nil)
 	if err != nil {
 		return revisitRecord{}, err
@@ -95,7 +95,7 @@ func checkCDXRevisit(CDXURL string, digest string, targetURI string, cookie stri
 func checkDoppelgangerRevisit(DoppelgangerHost string, digest string, targetURI string) (revisitRecord, error) {
 	// Doppelganger is not expecting a hash header either but this will all be rewritten ... shortly...
 	digest = strings.SplitN(digest, ":", 2)[1]
-	
+
 	req, err := http.NewRequest("GET", DoppelgangerHost+"/api/records/"+digest+"?uri="+targetURI, nil)
 	if err != nil {
 		return revisitRecord{}, err
diff --git a/gzip_interface.go b/gzip_interface.go
index 4ca2fa7..84ceb0f 100644
--- a/gzip_interface.go
+++ b/gzip_interface.go
@@ -19,4 +19,4 @@ type GzipReaderInterface interface {
 	io.ReadCloser
 	Multistream(enable bool)
 	Reset(r io.Reader) error
-}
\ No newline at end of file
+}
diff --git a/write.go b/write.go
index a173856..8073672 100644
--- a/write.go
+++ b/write.go
@@ -8,8 +8,8 @@ import (
 	"strings"
 	"time"
 
-	"github.com/internetarchive/gowarc/pkg/spooledtempfile"
 	"github.com/google/uuid"
+	"github.com/internetarchive/gowarc/pkg/spooledtempfile"
 	"github.com/klauspost/compress/zstd"
 )
 