From d461a7211f0df57f26735939703c7466e3274ccd Mon Sep 17 00:00:00 2001 From: Martin Goik <goik@hdm-stuttgart.de> Date: Wed, 18 Jan 2023 15:31:46 +0100 Subject: [PATCH] Hash value clashes --- Doc/Sd1/CoreClasses/coreClasses.xml | 107 ++++++++++++++++++++-------- 1 file changed, 79 insertions(+), 28 deletions(-) diff --git a/Doc/Sd1/CoreClasses/coreClasses.xml b/Doc/Sd1/CoreClasses/coreClasses.xml index 9ae0e8da4..de66a8e2e 100644 --- a/Doc/Sd1/CoreClasses/coreClasses.xml +++ b/Doc/Sd1/CoreClasses/coreClasses.xml @@ -526,17 +526,18 @@ public boolean equals(Object anObject) { <para><code language="java">true == a.equals(b)</code> ⟹ <code language="java">a.hashCode() == b.hashCode()</code>.</para> - <para>An ideal/perfect <methodname>hashCode()</methodname> method - in addition will return different values whenever two instances - <code>a</code> and <code>b</code> differ in value with respect to - the underlying <methodname>equals()</methodname> method:</para> + <para>A so-called <emphasis>perfect</emphasis> + <methodname>hashCode()</methodname> method in addition will return + different values whenever two instances <code>a</code> and + <code>b</code> differ in value with respect to the underlying + <methodname>equals()</methodname> method:</para> <para><code language="java">false == a.equals(b)</code> ⟺ <code language="java">a.hashCode() != b.hashCode()</code>.</para> - <para>Combining these two statements a perfect hashCode() method - will have the following property with respect to its corresponding - <methodname>equals()</methodname> method:</para> + <para>Combining these two statements, a + <emphasis>perfect</emphasis> <methodname>hashCode()</methodname> method will have the + following property:</para> <para><code language="java">a.equals(b) == (a.hashCode() == b.hashCode())</code></para> @@ -565,6 +566,23 @@ public boolean equals(Object anObject) { So method 2 requiring just two additions offers (slightly) better runtime performance at the expense of a higher hash value collision
rate.</para> + + <note> + <para>Perfect hash functions are rare with respect to real-world + modeling problems. In the current example + <classname>TimePeriod</classname> instances are limited to 24 + hours, 59 minutes and 59 seconds. This limit is equal to 89999 + seconds, fitting well into the count of <inlineequation> + <m:math display="inline"> + <m:msup> + <m:mi>2</m:mi> + + <m:mi>32</m:mi> + </m:msup> + </m:math> + </inlineequation> different <code language="java">int</code> + values.</para> + </note> </answer> </qandaentry> </qandadiv> @@ -577,7 +595,7 @@ public boolean equals(Object anObject) { <qandadiv> <qandaentry> <question> - <para>In the previous exercise we found an ideal + <para>In the previous exercise we found a perfect <methodname>hashCode()</methodname> implementation:</para> <programlisting language="java">public class TimePeriod { @@ -588,7 +606,9 @@ public boolean equals(Object anObject) { } }</programlisting> - <para>Is this possible for instances of String as well?</para> + <para>Does a perfect hash function exist for <link + xlink:href="https://docs.oracle.com/en/java/javase/17/docs/api/java.base/java/lang/String.html">String</link> + instances as well?</para> <tip> <para>Consider the possible number of different strings.</para> @@ -596,19 +616,27 @@ public boolean equals(Object anObject) { </question> <answer> - <para>It is not possible to construct a perfect - <methodname>hashCode()</methodname> method acting on arbitrary - strings. A Java <classname + <para>It is not possible to construct a perfect <link + xlink:href="https://docs.oracle.com/en/java/javase/17/docs/api/java.base/java/lang/Object.html#hashCode()">hashCode()</link> + method acting on strings. A Java <classname xlink:href="https://docs.oracle.com/en/java/javase/17/docs/api/java.base/java/lang/String.html">String</classname> consists of individual <code language="java">char</code> elements - each requiring two bytes. 
Considering strings of fixed length we - have the following number of different strings:</para> + each requiring two bytes representing <inlineequation> + <m:math display="inline"> + <m:msup> + <m:mi>2</m:mi> + + <m:mi>16</m:mi> + </m:msup> + </m:math> + </inlineequation> different characters. Depending on a string's + length we have:</para> <informaltable border="1"> <tr> - <th>Number of chars</th> + <th>String length (number of characters)</th> - <th>Number of possible strings</th> + <th>Number of different strings</th> </tr> <tr> @@ -666,22 +694,45 @@ public boolean equals(Object anObject) { </tr> </informaltable> - <para>A four byte <code language="java">int</code> only offers - <inlineequation> + <para>Thus, considering just the union of zero- (empty), one- and + two-character strings, we have <inlineequation> + <m:math display="inline"> + <m:mrow> + <m:mi>1</m:mi> + + <m:mo>+</m:mo> + + <m:msup> + <m:mi>2</m:mi> + + <m:mi>16</m:mi> + </m:msup> + + <m:mo>+</m:mo> + + <m:msup> + <m:mi>2</m:mi> + + <m:mi>32</m:mi> + </m:msup> + </m:mrow> + </m:math> + </inlineequation> possibilities, exceeding the <inlineequation> <m:math display="inline"> <m:msup> <m:mi>2</m:mi> - <m:mrow> - <m:mi>32</m:mi> - </m:mrow> + <m:mi>32</m:mi> </m:msup> </m:math> - </inlineequation> different values. Thus even mapping just one- - and two-<xref linkend="glo_unicode"/> character strings exceeds - the number of different <code language="java">int</code> values - thus requiring different string instances being mapped to - identical hash values. Consider for example:</para> + </inlineequation> count of different <code + language="java">int</code> values. 
Thus hash value clashes are + inevitable.</para> + + <para>The <xref linkend="glo_JDK"/>'s <link + xlink:href="https://docs.oracle.com/en/java/javase/17/docs/api/java.base/java/lang/String.html#hashCode()">String.hashCode()</link> + implementation already reveals conflicts for <xref + linkend="glo_ASCII"/> strings of length 2:</para> <informaltable border="1"> <tr> @@ -691,10 +742,10 @@ public boolean equals(Object anObject) { </tr> <tr> - <td valign="top"><programlisting language="java">System.out.println("hashcode of AA: " + "Aa".hashCode()); + <td valign="top"><programlisting language="java">System.out.println("hashcode of Aa: " + "Aa".hashCode()); System.out.println("hashcode of BB: " + "BB".hashCode());</programlisting></td> - <td valign="top"><screen>hashcode of AA: 2112 + <td valign="top"><screen>hashcode of Aa: 2112 hashcode of BB: 2112</screen></td> </tr> </informaltable> -- GitLab