TransWikia.com

Javascript Syntax-Highlighter for Java

Code Review Asked on October 27, 2021

Project

I wanted to create a syntax highlighter for Java using JavaScript, HTML and CSS. It uses regular expressions to find the parts that should be highlighted (at the moment: keywords, strings, comments, imports) and then uses HTML-tags to highlight the found parts.

Result

The website looks like this before entering some code:

enter image description here

Example

I’ve used the following java-snippet to test the code:

import java.time.LocalDate; 

public class Person {
    //Local variable for dateOfBirth
    private LocalDate dateOfBirth;    

    public Person(int year, int month, int day) {
        //See API also: https://docs.oracle.com/javase/8/docs/api/java/time/LocalDate.html
        dateOfBirth = LocalDate.of(year, month, day);
        //Keywords (e.g. int) are not highlighted in comments and strings
        System.out.println("Hello (int)");
    }

    /*
     * Getter
     */
    public LocalDate getDateOfBirth() {
        return dateOfBirth;
    }
}

The result looks like this:

enter image description here

Background

This is my first HTML/CSS/JS-project.

Code

var keywordsColor = "#0033cc";
var controlKeywordsColor = "#009933";
var typesKeywordsColor = "#3399ff";
var stringColor = "#ff3300";
var importColor = "#0033cc";
var commentColor = "gray";

var text;

var keywords = ["abstract", "assert", "class", "const", "extends", "false", "final",
"implements", "import", "instanceof", "interface", "native", "new", "null", "package",
"private", "protected", "public", "return", "static", "strictfp", "super", "synchronized",
"System", "this", "throw", "throws", "transient", "true", "volatile"];

var controlKeywords = ["break", "case", "catch", "continue", "default", "do", "else",
"finally", "for", "goto", "if", "switch", "try", "while"];

var typesKeywords = ["boolean", "byte", "char", "double", "enum", "float", "int",
"long", "short", "String", "void"];
var otherKeywords = [];

function highlight() {
text = document.getElementById("Input").value;
highlightKeywords();
highlightStrings();
highlightImports();
highlightSingleLineComments();
highlightMultiLineComments();
addStyles();
document.getElementById("Output").value = text;
document.getElementById("outputArea").innerHTML = text;
}

function highlightKeywords() {
var i;

for (i = 0; i < keywords.length; i++) {
  var x = new RegExp(keywords[i] + " ", "g");
  var y = "<span style='color:" + keywordsColor + ";font-weight:bold;'>" + keywords[i] + " </span>";
  text = text.replace(x, y);
}

for (i = 0; i < controlKeywords.length; i++) {
  var x = new RegExp(controlKeywords[i] + " ", "g");
  var y = "<span style='color:" + controlKeywordsColor + "; font-weight:bold; '>" + controlKeywords[i] + " </span>";
  text = text.replace(x, y);
}

for (i = 0; i < typesKeywords.length; i++) {
  var x = new RegExp(typesKeywords[i] + " ", "g");
  var y = "<span style='color:" + typesKeywordsColor + "; font-weight:bold; '>" + typesKeywords[i] 
      + " </span>";
  text = text.replace(x, y);
}
}

function highlightStrings() {
text = text.replace(/"(.*?)"/g,
     "<span id="str"style='color:" + stringColor + "; font-weight:bold; '>" 
     + ""$1"" + "</span>");
}

function highlightImports() {
text = text.replace(/import(.*?);/g,
     "<span id="str"style='color:" + importColor + "; font-weight:bold; '>" 
     + "import$1;" + "</span>");
}

function highlightSingleLineComments() {
text = text.replace(///(.*)/g,
     "<span id="comment"style='color:" + commentColor + "; font-weight:bold; '>" 
     + "//$1" + "</span>");
}

function highlightMultiLineComments() {
text = text.replace(//*([sS]*?)*//g,
    "<span id="comment"style='color:" + commentColor + "; font-weight:bold; '>" 
     + "/*$1*/" + "</span>");
}

function addStyles() {
text = "<!-- Code begins here -->n<pre><code>n"
    + "<style>#comment span {color:" + commentColor + "!important;}</style>"
    + "<style>#str span {color:" + stringColor + "!important;}</style>" + text
    + "n</code></pre>n<!-- Code ends here -->n";
}
/* Navigation bar style */
.nav ul { 
    background: ForestGreen; /* Sets the background-color */
    list-style: none; /* Removes bullet point */
    overflow: hidden; /* What happens when element is too big for formatting context*/
    padding: 0px; /* padding-area at all four sides of an element */
}

.nav li {
    float: left; /* Move element to the left and add new element on the right side*/
    border-right: 2px solid LightGray;/* Border lines on the right side of each element */
}

.nav a {
    color: black; /* Font color has to be set here, because otherwise it would be a blue hyperlink */
    display: inline-block; /* One box for all elements */
    font-size: large; /* Sets font size to a large size */
    text-decoration: none; /* Removes underline */
    padding: 4px;
}

.nav a:hover {
    background: AliceBlue; /* Changes background of element when user is hovering over it */
}

.nav a.active {
    background: DarkGreen; /* Changes background of current element */
}

/* Other */
#code {
    background: LightGray;
    font: monospace;
}

.column {
  float: left;
  width: 50%;
}
<!DOCTYPE html>
<html>
    <!-- Head -->
    <head>
        <meta charset="utf-8">
        <link rel="stylesheet" type="text/css" href="style.css">
    </head>

    <!-- Navigation bar -->
    <header>
        <div class="nav">
            <ul>
                <li><a class = "active" href="index.html">Home</a></li>
            </ul> 
        </div>
    </header>

    <!-- Body -->
    <body>
        <h2>HTML syntax-highlighting for Java</h2>

        <!-- Left column -->
        <div class="column">

            <!-- Input Area -->
            <h4>Input:</h4>
            <div style = "white-space = pre !important">
                <textarea id="Input" cols="80" rows="8" wrap = "off" style = "resize: none; background: LightGray"></textarea>
            </div>
            <br><br>
            <button type="button" onclick="highlight()">Highlight</button> 

            <!-- Output Area -->
            <h4>Output:</h4>
            <div style = "white-space = pre !important">
                <textarea id="Output" cols="80" rows="8" wrap = "off" style = "resize: none; background: LightGray"></textarea>
            </div>    
            <style>
                document.getElementById("Input").style.whiteSpace = "nowrap"; 
                document.getElementById("Output").style.whiteSpace = "nowrap"; 
            </style>
        </div>

        <!-- Right Column -->
        <div class="column">
            <h4>Preview</h4>
            <div id="outputArea" style="overflow-y:auto; overflow-x:auto; height: 690px">
            </div>
        </div>
    </body>

    <script language = "javascript" type = "text/javascript" src = "highlightSyntax.js"></script>
</html>

Questions

How can this code be improved? Did I make a major mistake in regards to the best-practices of HTML/CSS/JS?

Any suggestions are appreciated.


The follow-up question can be found here.

4 Answers

Interpreting any source code language entirely by regular expression — which is to say, without actually parsing the code and building an understanding of it on a syntactic level — is notoriously difficult to do. Your regular expressions do fall prey to some of the common issues with regexp-as-parser, since it will mis-highlight all of the following:

public class Person {
    private Account my_import_export;
    private Multibyte stupidClassName;
    System.out.println("Hi "friend".");
}

Ensuring that your keywords don't start in the middle of a word would help a lot, and fix the first two. The escaped quotes thing is trickier.

Answered by FeRD on October 27, 2021

Your current approach of highlighting one token type after another will fail for more complicated examples. Imagine this:

String s = "public data, private secrets";

The words in the string are not keywords.

To fix this, you need to change your code to tokenize the input text in a single pass, like this pseudo code:

function tokenize(text) {
    const tokens = [];

    while (text !== '') {
        if (text starts with whitespace)
            tokens.push(['space', leading space]);
        else if (text starts with keyword)
            tokens.push(['keyword.flow', keyword]);
        else if (text starts with string)
            tokens.push(['string', string]);
        else
            error();
        text = text without the current token;
    }
    return tokens;
}

Using this structure, you can correctly parse Java code. Parsing more esoteric languages like Python or Kotlin or even Perl requires even more sophisticated parsers, but Java is a very simple language (on the syntactical level).

Once you have split the text into tokens, generating the highlighted HTML from the tokens is trivial.

Answered by Roland Illig on October 27, 2021

For a beginner this looks like a great start! Below are some suggestions to clean things up and make things more consistent.

It is a common convention for HTML to not contain spaces between attribute keys and values.

So instead of

<script language = "javascript" type = "text/javascript" src = "highlightSyntax.js"></script>

make it simply:

<script language="javascript" type="text/javascript" src="highlightSyntax.js"></script>

And similarly for the <div> that contains the first <textarea>.

While single quotes can be used to delimit the attribute values of HTML, it is best to be consistent and use double quotes - so the JavaScript functions that wrap keywords in HTML can use single-quotes to delimit the strings, which is inline with many style guides (e.g. aibnb, google).

Instead of

var y = "<span style='color:" + typesKeywordsColor + "; font-weight:bold; '>" + typesKeywords[i] 
 + " </span>";

Use single quotes:

var y = '<span style="color:' + typesKeywordsColor + '; font-weight:bold; ">' + typesKeywords[i] 
  + ' </span>';

Template literals could also be used to generate the strings though note the browser support as that may affect the target audience.

var y = `<span style="color:${typesKeywordsColor}; font-weight:bold; ">${typesKeywords[i]} </span>`;

The attribute id must be unique1 so instead of using multiple elements with the same id attribute (e.g. <span id="comment"), use a class name instead. Also, the inline style attributes should be put into CSS because they aren't so dynamic. Thus variables like keywordsColor can be eliminated from the JavaScript. CSS custom properties (variables) could be used if there is a need.

Instead of adding event handlers in HTML, e.g.

<button type="button" onclick="highlight()">Highlight</button> 

It can be done in JavaScript in multiple ways. One way (which many prefer) is to use element.addEventListener() which allows adding multiple handlers to an element. For example, presuming an attribute id="highlightButton" is added to that button (though that isn’t the only way to access that element via JS):

document.getElementById('highlightButton').addEventListener('click', highlight);

This keeps the event handling logic separate from the markup.

There appears to be a style tag with JavaScript in it - I presume the intention was to use script instead. And those styles can be put into the CSS instead (i.e. in style.css). If you were aiming for the styles to be applied at a certain event (e.g. DOMContentLoaded) then apply classes accordingly.

    <style>
           document.getElementById("Input").style.whiteSpace = "nowrap"; 
           document.getElementById("Output").style.whiteSpace = "nowrap"; 
    </style>

In the styles there is one rule for padding on .nav ul:

padding: 0px;

For <length> values "unit is optional after the number 0"2 (and could add confusion3).

I also see this in the HTML:

<div style = "white-space = pre !important">

But note that:

Using `!important, however, is bad practice and should be avoided because it makes debugging more difficult by breaking the natural cascading in your stylesheets.
...
Instead of using !important, consider:

  1. Make better use of the CSS cascade
  2. Use more specific rules. By indicating one or more elements before the element you're selecting, the rule becomes more specific and gets higher priority

4

Answered by Sᴀᴍ Onᴇᴌᴀ on October 27, 2021

There's a glaring problem with this: no user input validation.

A basic test revealed this:

Oops

Ah, but that's not Java you say? True, but what if some poor sod has a stray HTML tag in their docstring?

Centered Java

All HTML that's already in the user input should probably be made harmless. Either escape it, wrap it in a container that will treat it as a string instead of code, anything. Or people will do all kinds of strange things with your site.

The docstring should've been properly rendered instead of cutting out the this-just-happens-to-be-valid-HTML.

/**
 * <center>
 */

That's a valid docstring. It should've been rendered as such.

Another example. The following input:

public final class Solution extends Mightyfine <A, B>
    implements Foo {
        /**
        * <hr>
        */<span id="comment" style='color:gray; font-weight:bold;'>
    }
}

Results in the following output:

<!-- Code begins here -->
<pre><code>
<style>#comment span {color:gray!important;}</style><style>#str span {color:#ff3300!important;}</style><span style='color:#0033cc;font-weight:bold;'>public </span><span style='color:#0033cc;font-weight:bold;'>final </span><span style='color:#0033cc;font-weight:bold;'>class </span>Solution <span style='color:#0033cc;font-weight:bold;'>extends </span>Mightyfine <A, B>
    <span style='color:#0033cc;font-weight:bold;'>implements </span>Foo {
        <span id="comment"style='color:gray; font-weight:bold; '>/**
        * <hr>
        */</span><span id=<span id="str"style='color:#ff3300; font-weight:bold; '>"comment"</span> style='color:gray; font-weight:bold;'>
    }
}
</code></pre>
<!-- Code ends here -->

*/</span><span id=<span id= is going to be interesting to parse for a browser. On my machine, that looks like this:

Java Unhinged

Notice that there's a lot of parts being parsed wrong. For example, where did <A, B> go? It's incorrectly parsed as HTML. The offending characters should either be escaped or simply be parsed differently than they are now. Leaving them unescaped, like how it is done now, will lead to behaviour you don't want.

Answered by Mast on October 27, 2021

Add your own answers!

Ask a Question

Get help from others!

© 2024 TransWikia.com. All rights reserved. Sites we Love: PCI Database, UKBizDB, Menu Kuliner, Sharing RPP