[#188] Improve content-type detector

Signed-off-by: Aleksey Kravchenko <al.kravchenko@yadro.com>
2024-12-24 18:42:02 +03:00 · 2024-12-24 18:42:02 +03:00 · a1097b7c59
commit a1097b7c59
parent a4e3767d4b
3 changed files with 81 additions and 20 deletions
--- a/internal/handler/head.go
+++ b/internal/handler/head.go
@ -45,7 +45,11 @@ func (h *Handler) headObject(ctx context.Context, req request, objectAddress oid
 	}

 	req.Response.Header.Set(fasthttp.HeaderContentLength, strconv.FormatUint(obj.PayloadSize(), 10))
-	var contentType string
+	var (
+		contentType string
+		filename    string
+		filepath    string
+	)
 	for _, attr := range obj.Attributes() {
 		key := attr.Key()
 		val := attr.Value()
@ -69,8 +73,15 @@ func (h *Handler) headObject(ctx context.Context, req request, objectAddress oid
 			req.Response.Header.Set(fasthttp.HeaderLastModified, time.Unix(value, 0).UTC().Format(http.TimeFormat))
 		case object.AttributeContentType:
 			contentType = val
+		case object.AttributeFilePath:
+			filepath = val
+		case object.AttributeFileName:
+			filename = val
 		}
 	}
+	if filename == "" {
+		filename = filepath
+	}

 	idsToResponse(&req.Response, obj)

@ -85,7 +96,7 @@ func (h *Handler) headObject(ctx context.Context, req request, objectAddress oid
 			}

 			return h.frostfs.RangeObject(ctx, prmRange)
-		})
+		}, filename)
 		if err != nil && err != io.EOF {
 			req.handleFrostFSErr(err, start)
 			return
--- a/internal/handler/reader.go
+++ b/internal/handler/reader.go
@ -4,9 +4,11 @@ import (
 	"bytes"
 	"context"
 	"io"
+	"mime"
 	"net/http"
 	"path"
 	"strconv"
+	"strings"
 	"time"

 	"git.frostfs.info/TrueCloudLab/frostfs-http-gw/internal/logs"
@ -25,7 +27,7 @@ type readCloser struct {

 // initializes io.Reader with the limited size and detects Content-Type from it.
 // Returns r's error directly. Also returns the processed data.
-func readContentType(maxSize uint64, rInit func(uint64) (io.Reader, error)) (string, []byte, error) {
+func readContentType(maxSize uint64, rInit func(uint64) (io.Reader, error), filename string) (string, []byte, error) {
 	if maxSize > sizeToDetectType {
 		maxSize = sizeToDetectType
 	}
@ -44,7 +46,20 @@ func readContentType(maxSize uint64, rInit func(uint64) (io.Reader, error)) (str

 	buf = buf[:n]

-	return http.DetectContentType(buf), buf, err // to not lose io.EOF
+	contentType := http.DetectContentType(buf)
+
+	// http.DetectContentType reports "text/plain" for many kinds of text files,
+	// including CSS, JavaScript, and CSV files,
+	// so for such payloads the content type is looked up by the file's extension via mime.TypeByExtension.
+	if strings.HasPrefix(contentType, "text/plain") {
+		ext := path.Ext(filename)
+		// If the file doesn't have a file extension, we'll keep the content type as is.
+		if len(ext) > 0 {
+			contentType = mime.TypeByExtension(ext)
+		}
+	}
+
+	return contentType, buf, err // to not lose io.EOF
 }

 type getMultiobjectBodyParams struct {
@ -128,7 +143,7 @@ func (h *Handler) receiveFile(ctx context.Context, req request, objAddress oid.A

 		contentType, payloadHead, err = readContentType(payloadSize, func(uint64) (io.Reader, error) {
 			return payload, nil
-		})
+		}, filename)
 		if err != nil && err != io.EOF {
 			req.log.Error(logs.CouldNotDetectContentTypeFromPayload, zap.Error(err))
 			response.Error(req.RequestCtx, "could not detect Content-Type from payload: "+err.Error(), fasthttp.StatusBadRequest)
--- a/internal/handler/reader_test.go
+++ b/internal/handler/reader_test.go
@ -10,39 +10,74 @@ import (
 	"github.com/stretchr/testify/require"
 )

+const (
+	txtContentType        = "text/plain; charset=utf-8"
+	cssContentType        = "text/css; charset=utf-8"
+	htmlContentType       = "text/html; charset=utf-8"
+	javascriptContentType = "text/javascript; charset=utf-8"
+
+	htmlBody = "<!DOCTYPE html><html ><head><meta charset=\"utf-8\"><title>Test Html</title>"
+)
+
 func TestDetector(t *testing.T) {
-	txtContentType := "text/plain; charset=utf-8"
 	sb := strings.Builder{}
 	for i := 0; i < 10; i++ {
 		sb.WriteString("Some txt content. Content-Type must be detected properly by detector.")
 	}

 	for _, tc := range []struct {
-		Name        string
-		ContentType string
-		Expected    string
+		Name                string
+		ExpectedContentType string
+		Content             string
+		FileName            string
 	}{
 		{
-			Name:        "less than 512b",
-			ContentType: txtContentType,
-			Expected:    sb.String()[:256],
+			Name:                "less than 512b",
+			ExpectedContentType: txtContentType,
+			Content:             sb.String()[:256],
+			FileName:            "test.txt",
 		},
 		{
-			Name:        "more than 512b",
-			ContentType: txtContentType,
-			Expected:    sb.String(),
+			Name:                "more than 512b",
+			ExpectedContentType: txtContentType,
+			Content:             sb.String(),
+			FileName:            "test.txt",
+		},
+		{
+			Name:                "css content type",
+			ExpectedContentType: cssContentType,
+			Content:             sb.String(),
+			FileName:            "test.css",
+		},
+		{
+			Name:                "javascript content type",
+			ExpectedContentType: javascriptContentType,
+			Content:             sb.String(),
+			FileName:            "test.js",
+		},
+		{
+			Name:                "html content type by file content",
+			ExpectedContentType: htmlContentType,
+			Content:             htmlBody,
+			FileName:            "test.detect-by-content",
+		},
+		{
+			Name:                "html content type by file extension",
+			ExpectedContentType: htmlContentType,
+			Content:             htmlBody,
+			FileName:            "test.html",
 		},
 	} {
 		t.Run(tc.Name, func(t *testing.T) {
-			contentType, data, err := readContentType(uint64(len(tc.Expected)),
+			contentType, data, err := readContentType(uint64(len(tc.Content)),
 				func(uint64) (io.Reader, error) {
-					return strings.NewReader(tc.Expected), nil
-				},
+					return strings.NewReader(tc.Content), nil
+				}, tc.FileName,
 			)

 			require.NoError(t, err)
-			require.Equal(t, tc.ContentType, contentType)
-			require.True(t, strings.HasPrefix(tc.Expected, string(data)))
+			require.Equal(t, tc.ExpectedContentType, contentType)
+			require.True(t, strings.HasPrefix(tc.Content, string(data)))
 		})
 	}
 }