[#188] Improve content-type detector
All checks were successful
/ DCO (pull_request) Successful in 2m58s
/ Vulncheck (pull_request) Successful in 3m14s
/ Builds (pull_request) Successful in 4m1s
/ Lint (pull_request) Successful in 4m8s
/ Tests (pull_request) Successful in 3m36s

Signed-off-by: Aleksey Kravchenko <al.kravchenko@yadro.com>
This commit is contained in:
Aleksey Kravchenko 2024-12-24 18:42:02 +03:00
parent a4e3767d4b
commit a1097b7c59
3 changed files with 81 additions and 20 deletions

View file

@ -45,7 +45,11 @@ func (h *Handler) headObject(ctx context.Context, req request, objectAddress oid
} }
req.Response.Header.Set(fasthttp.HeaderContentLength, strconv.FormatUint(obj.PayloadSize(), 10)) req.Response.Header.Set(fasthttp.HeaderContentLength, strconv.FormatUint(obj.PayloadSize(), 10))
var contentType string var (
contentType string
filename string
filepath string
)
for _, attr := range obj.Attributes() { for _, attr := range obj.Attributes() {
key := attr.Key() key := attr.Key()
val := attr.Value() val := attr.Value()
@ -69,8 +73,15 @@ func (h *Handler) headObject(ctx context.Context, req request, objectAddress oid
req.Response.Header.Set(fasthttp.HeaderLastModified, time.Unix(value, 0).UTC().Format(http.TimeFormat)) req.Response.Header.Set(fasthttp.HeaderLastModified, time.Unix(value, 0).UTC().Format(http.TimeFormat))
case object.AttributeContentType: case object.AttributeContentType:
contentType = val contentType = val
case object.AttributeFilePath:
filepath = val
case object.AttributeFileName:
filename = val
} }
} }
if filename == "" {
filename = filepath
}
idsToResponse(&req.Response, obj) idsToResponse(&req.Response, obj)
@ -85,7 +96,7 @@ func (h *Handler) headObject(ctx context.Context, req request, objectAddress oid
} }
return h.frostfs.RangeObject(ctx, prmRange) return h.frostfs.RangeObject(ctx, prmRange)
}) }, filename)
if err != nil && err != io.EOF { if err != nil && err != io.EOF {
req.handleFrostFSErr(err, start) req.handleFrostFSErr(err, start)
return return

View file

@ -4,9 +4,11 @@ import (
"bytes" "bytes"
"context" "context"
"io" "io"
"mime"
"net/http" "net/http"
"path" "path"
"strconv" "strconv"
"strings"
"time" "time"
"git.frostfs.info/TrueCloudLab/frostfs-http-gw/internal/logs" "git.frostfs.info/TrueCloudLab/frostfs-http-gw/internal/logs"
@ -25,7 +27,7 @@ type readCloser struct {
// initializes io.Reader with the limited size and detects Content-Type from it. // initializes io.Reader with the limited size and detects Content-Type from it.
// Returns r's error directly. Also returns the processed data. // Returns r's error directly. Also returns the processed data.
func readContentType(maxSize uint64, rInit func(uint64) (io.Reader, error)) (string, []byte, error) { func readContentType(maxSize uint64, rInit func(uint64) (io.Reader, error), filename string) (string, []byte, error) {
if maxSize > sizeToDetectType { if maxSize > sizeToDetectType {
maxSize = sizeToDetectType maxSize = sizeToDetectType
} }
@ -44,7 +46,20 @@ func readContentType(maxSize uint64, rInit func(uint64) (io.Reader, error)) (str
buf = buf[:n] buf = buf[:n]
return http.DetectContentType(buf), buf, err // to not lose io.EOF contentType := http.DetectContentType(buf)
// http.DetectContentType reports "text/plain" for many distinct text formats,
// including CSS, JavaScript, and CSV files, so when it does,
// the final content type is refined from the file's extension.
if strings.HasPrefix(contentType, "text/plain") {
ext := path.Ext(filename)
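// If the file name has no extension, keep the detected content type as is.
// NOTE(review): mime.TypeByExtension returns "" for an extension it does not know,
// so such files lose the detected "text/plain" type; consider keeping it in that case.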
if len(ext) > 0 {
contentType = mime.TypeByExtension(ext)
}
}
return contentType, buf, err // to not lose io.EOF
} }
type getMultiobjectBodyParams struct { type getMultiobjectBodyParams struct {
@ -128,7 +143,7 @@ func (h *Handler) receiveFile(ctx context.Context, req request, objAddress oid.A
contentType, payloadHead, err = readContentType(payloadSize, func(uint64) (io.Reader, error) { contentType, payloadHead, err = readContentType(payloadSize, func(uint64) (io.Reader, error) {
return payload, nil return payload, nil
}) }, filename)
if err != nil && err != io.EOF { if err != nil && err != io.EOF {
req.log.Error(logs.CouldNotDetectContentTypeFromPayload, zap.Error(err)) req.log.Error(logs.CouldNotDetectContentTypeFromPayload, zap.Error(err))
response.Error(req.RequestCtx, "could not detect Content-Type from payload: "+err.Error(), fasthttp.StatusBadRequest) response.Error(req.RequestCtx, "could not detect Content-Type from payload: "+err.Error(), fasthttp.StatusBadRequest)

View file

@ -10,8 +10,16 @@ import (
"github.com/stretchr/testify/require" "github.com/stretchr/testify/require"
) )
// Expected Content-Type values asserted by the detector test cases.
const (
	cssContentType        = "text/css; charset=utf-8"
	htmlContentType       = "text/html; charset=utf-8"
	javascriptContentType = "text/javascript; charset=utf-8"
	txtContentType        = "text/plain; charset=utf-8"
)

// htmlBody is the HTML document prefix used as payload by the HTML test cases.
const htmlBody = "<!DOCTYPE html><html ><head><meta charset=\"utf-8\"><title>Test Html</title>"
func TestDetector(t *testing.T) { func TestDetector(t *testing.T) {
txtContentType := "text/plain; charset=utf-8"
sb := strings.Builder{} sb := strings.Builder{}
for i := 0; i < 10; i++ { for i := 0; i < 10; i++ {
sb.WriteString("Some txt content. Content-Type must be detected properly by detector.") sb.WriteString("Some txt content. Content-Type must be detected properly by detector.")
@ -19,30 +27,57 @@ func TestDetector(t *testing.T) {
for _, tc := range []struct { for _, tc := range []struct {
Name string Name string
ContentType string ExpectedContentType string
Expected string Content string
FileName string
}{ }{
{ {
Name: "less than 512b", Name: "less than 512b",
ContentType: txtContentType, ExpectedContentType: txtContentType,
Expected: sb.String()[:256], Content: sb.String()[:256],
FileName: "test.txt",
}, },
{ {
Name: "more than 512b", Name: "more than 512b",
ContentType: txtContentType, ExpectedContentType: txtContentType,
Expected: sb.String(), Content: sb.String(),
FileName: "test.txt",
},
{
Name: "css content type",
ExpectedContentType: cssContentType,
Content: sb.String(),
FileName: "test.css",
},
{
Name: "javascript content type",
ExpectedContentType: javascriptContentType,
Content: sb.String(),
FileName: "test.js",
},
{
Name: "html content type by file content",
ExpectedContentType: htmlContentType,
Content: htmlBody,
FileName: "test.detect-by-content",
},
{
Name: "html content type by file extension",
ExpectedContentType: htmlContentType,
Content: htmlBody,
FileName: "test.html",
}, },
} { } {
t.Run(tc.Name, func(t *testing.T) { t.Run(tc.Name, func(t *testing.T) {
contentType, data, err := readContentType(uint64(len(tc.Expected)), contentType, data, err := readContentType(uint64(len(tc.Content)),
func(uint64) (io.Reader, error) { func(uint64) (io.Reader, error) {
return strings.NewReader(tc.Expected), nil return strings.NewReader(tc.Content), nil
}, }, tc.FileName,
) )
require.NoError(t, err) require.NoError(t, err)
require.Equal(t, tc.ContentType, contentType) require.Equal(t, tc.ExpectedContentType, contentType)
require.True(t, strings.HasPrefix(tc.Expected, string(data))) require.True(t, strings.HasPrefix(tc.Content, string(data)))
}) })
} }
} }