Validate file content type for txt, log, JSON file in C#

Surajit Kumar Shah 0 Reputation points
2025-01-08T17:16:17.0733333+00:00

/// <summary>
/// Checks whether the leading bytes of an uploaded file match any of the
/// known binary signatures registered in <c>_fileSignatures</c>.
/// </summary>
/// <param name="file">The uploaded file. A null or zero-length file is reported as invalid.</param>
/// <returns>
/// <c>true</c> when the file starts with at least one registered signature
/// (regardless of which extension that signature is registered under);
/// otherwise <c>false</c>.
/// </returns>
public static bool IsFileValid(IFormFile file)
{
    // A missing or empty upload cannot start with any signature; report it as
    // invalid instead of failing with a NullReferenceException.
    if (file is null || file.Length == 0)
    {
        return false;
    }

    // Only the first N bytes are needed, where N is the longest registered signature.
    int longestSignature = _fileSignatures.Max(entry => entry.Value.Max(signature => signature.Length));

    using (var reader = new BinaryReader(file.OpenReadStream()))
    {
        // ReadBytes returns fewer bytes than requested when the stream ends early.
        byte[] headerBytes = reader.ReadBytes(longestSignature);

        // Flatten every extension's signature list and look for a prefix match.
        return _fileSignatures.Values
            .SelectMany(signatures => signatures)
            .Any(signature =>
                headerBytes.Length >= signature.Length &&
                headerBytes.Take(signature.Length).SequenceEqual(signature));
    }
}

/// <summary>
/// Known leading-byte signatures ("magic numbers") keyed by file extension.
/// Each extension maps to the list of byte sequences a file of that type may start with.
/// </summary>
private static readonly Dictionary<string, List<byte[]>> _fileSignatures = new()
{
    { ".gif", new List<byte[]> { new byte[] { 0x47, 0x49, 0x46, 0x38 } } },
    { ".png", new List<byte[]> { new byte[] { 0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A } } },

    // ".jpeg" and ".jpg" name the same format, so both carry the same list
    // (the union of the markers that were previously split between them).
    { ".jpeg", new List<byte[]>
        {
            new byte[] { 0xFF, 0xD8, 0xFF, 0xE0 },
            new byte[] { 0xFF, 0xD8, 0xFF, 0xE1 },
            new byte[] { 0xFF, 0xD8, 0xFF, 0xE2 },
            new byte[] { 0xFF, 0xD8, 0xFF, 0xE3 },
            new byte[] { 0xFF, 0xD8, 0xFF, 0xE8 },
            new byte[] { 0xFF, 0xD8, 0xFF, 0xEE },
            new byte[] { 0xFF, 0xD8, 0xFF, 0xDB },
        }
    },
    { ".jpeg2000", new List<byte[]> { new byte[] { 0x00, 0x00, 0x00, 0x0C, 0x6A, 0x50, 0x20, 0x20, 0x0D, 0x0A, 0x87, 0x0A } } },
    { ".jpg", new List<byte[]>
        {
            new byte[] { 0xFF, 0xD8, 0xFF, 0xE0 },
            new byte[] { 0xFF, 0xD8, 0xFF, 0xE1 },
            new byte[] { 0xFF, 0xD8, 0xFF, 0xE2 },
            new byte[] { 0xFF, 0xD8, 0xFF, 0xE3 },
            new byte[] { 0xFF, 0xD8, 0xFF, 0xE8 },
            new byte[] { 0xFF, 0xD8, 0xFF, 0xEE },
            new byte[] { 0xFF, 0xD8, 0xFF, 0xDB },
        }
    },
    { ".zip", new List<byte[]> // also docx, xlsx, pptx, ...
        {
            new byte[] { 0x50, 0x4B, 0x03, 0x04 },
            new byte[] { 0x50, 0x4B, 0x4C, 0x49, 0x54, 0x45 },
            new byte[] { 0x50, 0x4B, 0x53, 0x70, 0x58 },
            new byte[] { 0x50, 0x4B, 0x05, 0x06 },
            new byte[] { 0x50, 0x4B, 0x07, 0x08 },
            new byte[] { 0x57, 0x69, 0x6E, 0x5A, 0x69, 0x70 },
        }
    },
    { ".pdf", new List<byte[]> { new byte[] { 0x25, 0x50, 0x44, 0x46 } } },
    { ".z", new List<byte[]>
        {
            new byte[] { 0x1F, 0x9D },
            new byte[] { 0x1F, 0xA0 },
        }
    },

    // NOTE(review): in a tar archive the "ustar" magic sits at byte offset 257,
    // not at the start of the file, so a check that only compares leading bytes
    // will not match these entries; verify how callers use them.
    { ".tar", new List<byte[]>
        {
            new byte[] { 0x75, 0x73, 0x74, 0x61, 0x72, 0x00, 0x30, 0x30 },
            new byte[] { 0x75, 0x73, 0x74, 0x61, 0x72, 0x20, 0x20, 0x00 },
        }
    },
    { ".tar.z", new List<byte[]>
        {
            new byte[] { 0x1F, 0x9D },
            new byte[] { 0x1F, 0xA0 },
        }
    },
    { ".tif", new List<byte[]>
        {
            new byte[] { 0x49, 0x49, 0x2A, 0x00 },
            new byte[] { 0x4D, 0x4D, 0x00, 0x2A },
        }
    },
    { ".tiff", new List<byte[]>
        {
            new byte[] { 0x49, 0x49, 0x2A, 0x00 },
            new byte[] { 0x4D, 0x4D, 0x00, 0x2A },
        }
    },
    { ".rar", new List<byte[]>
        {
            new byte[] { 0x52, 0x61, 0x72, 0x21, 0x1A, 0x07, 0x00 },
            new byte[] { 0x52, 0x61, 0x72, 0x21, 0x1A, 0x07, 0x01, 0x00 },
        }
    },
    { ".7z", new List<byte[]>
        {
            new byte[] { 0x37, 0x7A, 0xBC, 0xAF, 0x27, 0x1C },
        }
    },

    // These are Unicode byte order marks (UTF-8, UTF-16 LE, UTF-16 BE, UTF-32 BE).
    // A BOM is optional, so plain text files without one match none of these.
    { ".txt", new List<byte[]>
        {
            new byte[] { 0xEF, 0xBB, 0xBF },
            new byte[] { 0xFF, 0xFE },
            new byte[] { 0xFE, 0xFF },
            new byte[] { 0x00, 0x00, 0xFE, 0xFF },
        }
    },
    { ".mp3", new List<byte[]>
        {
            new byte[] { 0xFF, 0xFB },
            new byte[] { 0xFF, 0xF3 },
            new byte[] { 0xFF, 0xF2 },
            // ASCII "ID3" (ID3v2 tag header); the third byte is '3' = 0x33, not 0x43.
            new byte[] { 0x49, 0x44, 0x33 },
        }
    },
};

Hello, I found the above code to validate file content in C#. It is helpful for validating the content of some known file types: comparing the signature works fine for types like jpeg, gif, mp3, doc, docx, etc. However, the logic doesn't work for file types like txt, log, and JSON. Is there any solution to validate the content type of txt, log, and JSON files? I tried to match the signatures of txt, log, and JSON files, but they are always different for different files.

Developer technologies | ASP.NET | ASP.NET Core
Microsoft 365 and Office | Development | Office JavaScript API
Developer technologies | .NET | Other
Developer technologies | ASP.NET | Other
Developer technologies | C#
0 comments No comments
{count} votes

3 answers

Sort by: Most helpful
  1. Bruce (SqlWork.com) 79,101 Reputation points Volunteer Moderator
    2025-01-08T17:51:35.44+00:00

    txt and log files have no defined format and can contain any characters, so hard to validate.

    To validate a JSON file you need to read the entire file to see if it's valid. If you just want to validate the start characters, a JSON file has a couple of formats:

    • an object, then the file starts with a <whitespace>{ and ends with a }<whitespace>
    • an array, then the file starts with a <whitespace>[ and ends with ]<whitespace>
    • bool value, then <whitespace>true<whitespace> or <whitespace>false<whitespace>
    • null value, then <whitespace>null<whitespace>
    • string value, then <whitespace>" and ends with "<whitespace>
    • numeric value, then <whitespace><numeric string><whitespace>. A numeric string can be in integer, decimal, or exponent format (the JSON grammar does not allow hex).

    see:

    https://www.json.org/json-en.html

    0 comments No comments

  2. SurferOnWww 4,811 Reputation points
    2025-01-09T00:54:31.8266667+00:00

    Is there any solution to validate the content type of txt, log, JSON files ?

    No, there is no practical solution since they are all text files. If they have a BOM you will be able to guess that they are text files. However, there will be no way to differentiate among txt, log and JSON files.


  3. Danny Nguyen (WICLOUD CORPORATION) 725 Reputation points Microsoft External Staff
    2025-07-22T09:55:45.36+00:00

    Hi,

    You're correct that validating text-based files like .txt, .log, and .json is fundamentally different from validating binary files using magic numbers/signatures.

    Challenges to address with handling Text Files:

    • Unlike binary formats, text files don't have consistent header signatures.
    • Some UTF-encoded text files may start with a BOM (Byte Order Mark), but this is optional and not always present.
    • Text files can contain any combination of characters, making validation difficult.

    Since JSON has a strict syntax, you can attempt to parse it:

    /// <summary>
    /// Checks whether the full content of an uploaded file parses as a JSON document.
    /// </summary>
    /// <param name="file">The uploaded file. A null reference is reported as invalid.</param>
    /// <returns>
    /// <c>true</c> when the content parses as JSON; <c>false</c> when
    /// <paramref name="file"/> is null or parsing fails with a <see cref="JsonException"/>.
    /// Failures while reading the stream are not treated as "invalid JSON" and propagate to the caller.
    /// </returns>
    public static bool IsValidJson(IFormFile file)
    {
        if (file is null)
        {
            return false;
        }

        try
        {
            using (var stream = file.OpenReadStream())
            using (var reader = new StreamReader(stream))
            {
                // The whole file must be read: JSON validity cannot be decided from a prefix.
                var content = reader.ReadToEnd();

                // JsonDocument rents pooled memory, so it must be disposed to return it.
                using (JsonDocument.Parse(content))
                {
                    return true;
                }
            }
        }
        catch (JsonException)
        {
            // The content is not well-formed JSON.
            return false;
        }
    }
    

    For basic text file validation, you could:

    1. Check for BOM (optional)
    2. Check if the content is readable text (not binary):
    /// <summary>
    /// Heuristic check that an uploaded file looks like text: decodes up to the
    /// first 4096 characters and rejects the file if any of them is a control
    /// character other than carriage return, line feed, or tab.
    /// </summary>
    /// <param name="file">The uploaded file to sample.</param>
    /// <returns>
    /// <c>false</c> if a disallowed control character appears in the sample;
    /// otherwise <c>true</c> (including when nothing could be read).
    /// </returns>
    public static bool IsLikelyTextFile(IFormFile file)
    {
        const int SampleSize = 4096;

        using (var input = file.OpenReadStream())
        using (var text = new StreamReader(input))
        {
            // Only a leading sample is examined, not the entire file.
            var sample = new char[SampleSize];
            var charCount = text.Read(sample, 0, sample.Length);

            for (var index = 0; index < charCount; index++)
            {
                var current = sample[index];
                var isLayoutCharacter = current == '\r' || current == '\n' || current == '\t';

                // '\0' is itself a control character, so this single test also
                // covers the explicit null-character rejection.
                if (char.IsControl(current) && !isLayoutCharacter)
                {
                    return false;
                }
            }

            return true;
        }
    }
    

    For .log files, you might need domain-specific validation (checking for timestamps, log levels, etc.) since there's no standard format.

    You could create a validation flow like:

    1. First check binary signatures (for known binary formats)
    2. If no match, check if it's likely a text file
    3. If text, optionally try to determine specific format (JSON, XML, etc.)

Your answer

Answers can be marked as Accepted Answers by the question author, which helps users to know the answer solved the author's problem.