From a1097b7c59480c1afcc567dd19951faf705d6184 Mon Sep 17 00:00:00 2001
From: Aleksey Kravchenko
Date: Tue, 24 Dec 2024 18:42:02 +0300
Subject: [PATCH] [#188] Improve content-type detector

http.DetectContentType reports "text/plain" for CSS, JavaScript, CSV and
other textual payloads. When that happens, refine the result using the
extension of the object's FileName attribute (falling back to FilePath).
The sniffed type is kept when the name has no extension or the extension
is not registered in the MIME table, so the Content-Type is never empty.

Signed-off-by: Aleksey Kravchenko
---
 internal/handler/head.go        | 15 ++++++-
 internal/handler/reader.go      | 21 ++++++++--
 internal/handler/reader_test.go | 71 ++++++++++++++++++++++++++-------
 3 files changed, 87 insertions(+), 20 deletions(-)

diff --git a/internal/handler/head.go b/internal/handler/head.go
index f2e9f38..da96eff 100644
--- a/internal/handler/head.go
+++ b/internal/handler/head.go
@@ -45,7 +45,11 @@ func (h *Handler) headObject(ctx context.Context, req request, objectAddress oid
 	}
 
 	req.Response.Header.Set(fasthttp.HeaderContentLength, strconv.FormatUint(obj.PayloadSize(), 10))
-	var contentType string
+	var (
+		contentType string
+		filename    string
+		filepath    string
+	)
 	for _, attr := range obj.Attributes() {
 		key := attr.Key()
 		val := attr.Value()
@@ -69,8 +73,15 @@ func (h *Handler) headObject(ctx context.Context, req request, objectAddress oid
 			req.Response.Header.Set(fasthttp.HeaderLastModified, time.Unix(value, 0).UTC().Format(http.TimeFormat))
 		case object.AttributeContentType:
 			contentType = val
+		case object.AttributeFilePath:
+			filepath = val
+		case object.AttributeFileName:
+			filename = val
 		}
 	}
+	if filename == "" {
+		filename = filepath
+	}
 
 	idsToResponse(&req.Response, obj)
 
@@ -85,7 +96,7 @@ func (h *Handler) headObject(ctx context.Context, req request, objectAddress oid
 			}
 
 			return h.frostfs.RangeObject(ctx, prmRange)
-		})
+		}, filename)
 		if err != nil && err != io.EOF {
 			req.handleFrostFSErr(err, start)
 			return
diff --git a/internal/handler/reader.go b/internal/handler/reader.go
index 50121c9..60067ab 100644
--- a/internal/handler/reader.go
+++ b/internal/handler/reader.go
@@ -4,9 +4,11 @@ import (
 	"bytes"
 	"context"
 	"io"
+	"mime"
 	"net/http"
 	"path"
 	"strconv"
+	"strings"
 	"time"
 
 	"git.frostfs.info/TrueCloudLab/frostfs-http-gw/internal/logs"
@@ -25,7 +27,7 @@ type readCloser struct {
 
 // initializes io.Reader with the limited size and detects Content-Type from it.
 // Returns r's error directly. Also returns the processed data.
-func readContentType(maxSize uint64, rInit func(uint64) (io.Reader, error)) (string, []byte, error) {
+func readContentType(maxSize uint64, rInit func(uint64) (io.Reader, error), filename string) (string, []byte, error) {
 	if maxSize > sizeToDetectType {
 		maxSize = sizeToDetectType
 	}
@@ -44,7 +46,20 @@ func readContentType(maxSize uint64, rInit func(uint64) (io.Reader, error)) (str
 
 	buf = buf[:n]
 
-	return http.DetectContentType(buf), buf, err // to not lose io.EOF
+	contentType := http.DetectContentType(buf)
+
+	// Since the detector detects the "text/plain" content type for various types of text files,
+	// including CSS, JavaScript, and CSV files,
+	// we'll determine the final content type based on the file's extension.
+	if strings.HasPrefix(contentType, "text/plain") {
+		// mime.TypeByExtension returns "" when the name has no extension or the extension
+		// is not registered; in that case the sniffed content type is kept as is.
+		if typeByExt := mime.TypeByExtension(path.Ext(filename)); typeByExt != "" {
+			contentType = typeByExt
+		}
+	}
+
+	return contentType, buf, err // to not lose io.EOF
 }
 
 type getMultiobjectBodyParams struct {
@@ -128,7 +143,7 @@ func (h *Handler) receiveFile(ctx context.Context, req request, objAddress oid.A
 
 		contentType, payloadHead, err = readContentType(payloadSize, func(uint64) (io.Reader, error) {
 			return payload, nil
-		})
+		}, filename)
 		if err != nil && err != io.EOF {
 			req.log.Error(logs.CouldNotDetectContentTypeFromPayload, zap.Error(err))
 			response.Error(req.RequestCtx, "could not detect Content-Type from payload: "+err.Error(), fasthttp.StatusBadRequest)
diff --git a/internal/handler/reader_test.go b/internal/handler/reader_test.go
index c63a734..e143239 100644
--- a/internal/handler/reader_test.go
+++ b/internal/handler/reader_test.go
@@ -10,39 +10,80 @@ import (
 	"github.com/stretchr/testify/require"
 )
 
+const (
+	txtContentType        = "text/plain; charset=utf-8"
+	cssContentType        = "text/css; charset=utf-8"
+	htmlContentType       = "text/html; charset=utf-8"
+	javascriptContentType = "text/javascript; charset=utf-8"
+
+	htmlBody = "<!DOCTYPE html><html><head><title>Test Html</title></head><body></body></html>"
+)
+
 func TestDetector(t *testing.T) {
-	txtContentType := "text/plain; charset=utf-8"
 	sb := strings.Builder{}
 	for i := 0; i < 10; i++ {
 		sb.WriteString("Some txt content. Content-Type must be detected properly by detector.")
 	}
 
 	for _, tc := range []struct {
-		Name        string
-		ContentType string
-		Expected    string
+		Name                string
+		ExpectedContentType string
+		Content             string
+		FileName            string
 	}{
 		{
-			Name:        "less than 512b",
-			ContentType: txtContentType,
-			Expected:    sb.String()[:256],
+			Name:                "less than 512b",
+			ExpectedContentType: txtContentType,
+			Content:             sb.String()[:256],
+			FileName:            "test.txt",
 		},
 		{
-			Name:        "more than 512b",
-			ContentType: txtContentType,
-			Expected:    sb.String(),
+			Name:                "more than 512b",
+			ExpectedContentType: txtContentType,
+			Content:             sb.String(),
+			FileName:            "test.txt",
+		},
+		{
+			Name:                "css content type",
+			ExpectedContentType: cssContentType,
+			Content:             sb.String(),
+			FileName:            "test.css",
+		},
+		{
+			Name:                "javascript content type",
+			ExpectedContentType: javascriptContentType,
+			Content:             sb.String(),
+			FileName:            "test.js",
+		},
+		{
+			Name:                "html content type by file content",
+			ExpectedContentType: htmlContentType,
+			Content:             htmlBody,
+			FileName:            "test.detect-by-content",
+		},
+		{
+			Name:                "html content type by file extension",
+			ExpectedContentType: htmlContentType,
+			Content:             htmlBody,
+			FileName:            "test.html",
+		},
+		{
+			Name:                "unknown extension keeps detected content type",
+			ExpectedContentType: txtContentType,
+			Content:             sb.String(),
+			FileName:            "test.unknown-extension",
 		},
 	} {
 		t.Run(tc.Name, func(t *testing.T) {
-			contentType, data, err := readContentType(uint64(len(tc.Expected)),
+			contentType, data, err := readContentType(uint64(len(tc.Content)),
 				func(uint64) (io.Reader, error) {
-					return strings.NewReader(tc.Expected), nil
-				},
+					return strings.NewReader(tc.Content), nil
+				}, tc.FileName,
 			)
 
 			require.NoError(t, err)
-			require.Equal(t, tc.ContentType, contentType)
-			require.True(t, strings.HasPrefix(tc.Expected, string(data)))
+			require.Equal(t, tc.ExpectedContentType, contentType)
+			require.True(t, strings.HasPrefix(tc.Content, string(data)))
 		})
 	}
 }