[Date Prev][Date Next] [Thread Prev][Thread Next] [Date Index] [Thread Index]

Re: Plan C (was Re: Plan B for fixing 5.8.2 binary API)



On Thu, Oct 16, 2003 at 11:58:11AM -0400, Chip Salzenberg wrote:
> I can see a minimally intrusive implementation with a small code
> delta.  If it can't be done with stability and certainty, I'll line
> up right behind you to shoot it in the head.

Something like this? Only with the rough edges smoothed out?

I don't think that it really needs two set variables, given that we
don't set the 5.8.1 hash seed anymore. Also, MJD's calculations
convince me that the threshold for changing strategy is too conservative,
and that it might be better to initiate a split in the store routines
based on linked list length, rather than buckets used.

Currently only in blead.

Nicholas Clark

Change 21471 by nicholas@entropy on 2003/10/16 21:10:27

	Plan C for foiling the algorithmic complexity attack
	(based on Chip's plan A (binary compatibility with 5.8.0 and 5.8.1),
	 Chip's plan B (do something new inside the hv functions)
	 and introspective sort)
	Provides infrastructure for hashes to change their hash function
	if necessary, and code in hsplit to detect pathalogical data and
	instigate a random rehashing.
	Needs refinement. Let's see how much smoke it creates.

Affected files ...

... //depot/perl/embedvar.h#181 edit
... //depot/perl/hv.c#139 edit
... //depot/perl/hv.h#43 edit
... //depot/perl/intrpvar.h#134 edit
... //depot/perl/perl.c#527 edit
... //depot/perl/perlapi.h#103 edit
... //depot/perl/sv.c#691 edit
... //depot/perl/sv.h#149 edit
... //depot/perl/util.c#405 edit

Differences ...

==== //depot/perl/embedvar.h#181 (text+w) ====

@@ -320,6 +320,8 @@
 #define PL_multi_open		(vTHX->Imulti_open)
 #define PL_multi_start		(vTHX->Imulti_start)
 #define PL_multiline		(vTHX->Imultiline)
+#define PL_new_hash_seed	(vTHX->Inew_hash_seed)
+#define PL_new_hash_seed_set	(vTHX->Inew_hash_seed_set)
 #define PL_nexttoke		(vTHX->Inexttoke)
 #define PL_nexttype		(vTHX->Inexttype)
 #define PL_nextval		(vTHX->Inextval)
@@ -624,6 +626,8 @@
 #define PL_Imulti_open		PL_multi_open
 #define PL_Imulti_start		PL_multi_start
 #define PL_Imultiline		PL_multiline
+#define PL_Inew_hash_seed	PL_new_hash_seed
+#define PL_Inew_hash_seed_set	PL_new_hash_seed_set
 #define PL_Inexttoke		PL_nexttoke
 #define PL_Inexttype		PL_nexttype
 #define PL_Inextval		PL_nextval

==== //depot/perl/hv.c#139 (text) ====

@@ -274,7 +274,11 @@
         }
     }
 
-    PERL_HASH(hash, key, klen);
+    if (HvREHASH(hv)) {
+	PERL_HASH_INTERNAL(hash, key, klen);
+    } else {
+	PERL_HASH(hash, key, klen);
+    }
 
     /* entry = (HvARRAY(hv))[hash & (I32) HvMAX(hv)]; */
     entry = ((HE**)xhv->xhv_array)[hash & (I32) xhv->xhv_max];
@@ -445,7 +449,9 @@
             flags |= HVhek_WASUTF8 | HVhek_FREEKEY;
     }
 
-    if (!hash) {
+    if (HvREHASH(hv)) {
+	PERL_HASH_INTERNAL(hash, key, klen);
+    } else if (!hash) {
         if SvIsCOW_shared_hash(keysv) {
             hash = SvUVX(keysv);
         } else {
@@ -628,7 +634,12 @@
     if (flags)
         HvHASKFLAGS_on((SV*)hv);
 
-    if (!hash)
+    if (HvREHASH(hv)) {
+	/* We don't have a pointer to the hv, so we have to replicate the
+	   flag into every HEK, so that hv_iterkeysv can see it.  */
+	flags |= HVhek_REHASH;
+	PERL_HASH_INTERNAL(hash, key, klen);
+    } else if (!hash)
 	PERL_HASH(hash, key, klen);
 
     if (!xhv->xhv_array /* !HvARRAY(hv) */)
@@ -798,7 +809,12 @@
         HvHASKFLAGS_on((SV*)hv);
     }
 
-    if (!hash) {
+    if (HvREHASH(hv)) {
+	/* We don't have a pointer to the hv, so we have to replicate the
+	   flag into every HEK, so that hv_iterkeysv can see it.  */
+	flags |= HVhek_REHASH;
+	PERL_HASH_INTERNAL(hash, key, klen);
+    } else if (!hash) {
         if SvIsCOW_shared_hash(keysv) {
             hash = SvUVX(keysv);
         } else {
@@ -950,7 +966,11 @@
             k_flags |= HVhek_FREEKEY;
     }
 
-    PERL_HASH(hash, key, klen);
+    if (HvREHASH(hv)) {
+	PERL_HASH_INTERNAL(hash, key, klen);
+    } else {
+	PERL_HASH(hash, key, klen);
+    }
 
     /* oentry = &(HvARRAY(hv))[hash & (I32) HvMAX(hv)]; */
     oentry = &((HE**)xhv->xhv_array)[hash & (I32) xhv->xhv_max];
@@ -1107,8 +1127,11 @@
             k_flags |= HVhek_FREEKEY;
     }
 
-    if (!hash)
+    if (HvREHASH(hv)) {
+	PERL_HASH_INTERNAL(hash, key, klen);
+    } else if (!hash) {
 	PERL_HASH(hash, key, klen);
+    }
 
     /* oentry = &(HvARRAY(hv))[hash & (I32) HvMAX(hv)]; */
     oentry = &((HE**)xhv->xhv_array)[hash & (I32) xhv->xhv_max];
@@ -1255,7 +1278,11 @@
             k_flags |= HVhek_FREEKEY;
     }
 
-    PERL_HASH(hash, key, klen);
+    if (HvREHASH(hv)) {
+	PERL_HASH_INTERNAL(hash, key, klen);
+    } else {
+	PERL_HASH(hash, key, klen);
+    }
 
 #ifdef DYNAMIC_ENV_FETCH
     if (!xhv->xhv_array /* !HvARRAY(hv) */) entry = Null(HE*);
@@ -1359,7 +1386,9 @@
         if (key != keysave)
             k_flags |= HVhek_FREEKEY;
     }
-    if (!hash)
+    if (HvREHASH(hv)) {
+	PERL_HASH_INTERNAL(hash, key, klen);
+    } else if (!hash)
 	PERL_HASH(hash, key, klen);
 
 #ifdef DYNAMIC_ENV_FETCH
@@ -1415,6 +1444,8 @@
     register HE **bep;
     register HE *entry;
     register HE **oentry;
+    int longest_chain = 0;
+    int was_shared;
 
     PL_nomemok = TRUE;
 #if defined(STRANGE_MALLOC) || defined(MYMALLOC)
@@ -1445,6 +1476,9 @@
     aep = (HE**)a;
 
     for (i=0; i<oldsize; i++,aep++) {
+	int left_length = 0;
+	int right_length = 0;
+
 	if (!*aep)				/* non-existent */
 	    continue;
 	bep = aep+oldsize;
@@ -1455,14 +1489,90 @@
 		if (!*bep)
 		    xhv->xhv_fill++; /* HvFILL(hv)++ */
 		*bep = entry;
+		right_length++;
 		continue;
 	    }
-	    else
+	    else {
 		oentry = &HeNEXT(entry);
+		left_length++;
+	    }
 	}
 	if (!*aep)				/* everything moved */
 	    xhv->xhv_fill--; /* HvFILL(hv)-- */
+	/* I think we don't actually need to keep track of the longest length,
+	   merely flag if anything is too long. But for the moment while
+	   developing this code I'll track it.  */
+	if (left_length > longest_chain)
+	    longest_chain = left_length;
+	if (right_length > longest_chain)
+	    longest_chain = right_length;
     }
+
+
+    /* Pick your policy for "hashing isn't working" here:  */
+    if (longest_chain < 8 || longest_chain * 2 < HvTOTALKEYS(hv)
+	|| HvREHASH(hv)) {
+	return;
+    }
+
+    if (hv == PL_strtab) {
+	/* Urg. Someone is doing something nasty to the string table.
+	   Can't win.  */
+	return;
+    }
+
+    /* Awooga. Awooga. Pathological data.  */
+    /*PerlIO_printf(PerlIO_stderr(), "Awooga %d of %d with %d/%d buckets\n",
+      longest_chain, HvTOTALKEYS(hv), HvFILL(hv),  1+HvMAX(hv));*/
+
+    ++newsize;
+    Newz(2, a, PERL_HV_ARRAY_ALLOC_BYTES(newsize), char);
+    was_shared = HvSHAREKEYS(hv);
+
+    xhv->xhv_fill = 0;
+    HvSHAREKEYS_off(hv);
+    HvREHASH_on(hv);
+
+    aep = (HE **) xhv->xhv_array;
+
+    for (i=0; i<newsize; i++,aep++) {
+	entry = *aep;
+	while (entry) {
+	    /* We're going to trash this HE's next pointer when we chain it
+	       into the new hash below, so store where we go next.  */
+	    HE *next = HeNEXT(entry);
+	    UV hash;
+
+	    /* Rehash it */
+	    PERL_HASH_INTERNAL(hash, HeKEY(entry), HeKLEN(entry));
+
+	    if (was_shared) {
+		/* Unshare it.  */
+		HEK *new_hek
+		    = save_hek_flags(HeKEY(entry), HeKLEN(entry),
+				     hash, HeKFLAGS(entry));
+		unshare_hek (HeKEY_hek(entry));
+		HeKEY_hek(entry) = new_hek;
+	    } else {
+		/* Not shared, so simply write the new hash in. */
+		HeHASH(entry) = hash;
+	    }
+	    /*PerlIO_printf(PerlIO_stderr(), "%d ", HeKFLAGS(entry));*/
+	    HEK_REHASH_on(HeKEY_hek(entry));
+	    /*PerlIO_printf(PerlIO_stderr(), "%d\n", HeKFLAGS(entry));*/
+
+	    /* Copy oentry to the correct new chain.  */
+	    bep = ((HE**)a) + (hash & (I32) xhv->xhv_max);
+	    if (!*bep)
+		    xhv->xhv_fill++; /* HvFILL(hv)++ */
+	    HeNEXT(entry) = *bep;
+	    *bep = entry;
+
+	    entry = next;
+	}
+    }
+    Safefree (xhv->xhv_array);
+    xhv->xhv_array = a;		/* HvARRAY(hv) = a */
 }
 
 void
@@ -1566,6 +1676,7 @@
 #ifndef NODEFAULT_SHAREKEYS
     HvSHAREKEYS_on(hv);         /* key-sharing on by default */
 #endif
+
     xhv->xhv_max    = 7;	/* HvMAX(hv) = 7 (start with 8 buckets) */
     xhv->xhv_fill   = 0;	/* HvFILL(hv) = 0 */
     xhv->xhv_pmroot = 0;	/* HvPMROOT(hv) = 0 */
@@ -2039,7 +2150,17 @@
             sv = newSVpvn ((char*)as_utf8, utf8_len);
             SvUTF8_on (sv);
 	    Safefree (as_utf8); /* bytes_to_utf8() allocates a new string */
-        } else {
+	} else if (flags & HVhek_REHASH) {
+	    /* We don't have a pointer to the hv, so we have to replicate the
+	       flag into every HEK. This hv is using custom a hasing
+	       algorithm. Hence we can't return a shared string scalar, as
+	       that would contain the (wrong) hash value, and might get passed
+	       into an hv routine with a regular hash  */
+
+            sv = newSVpvn (HEK_KEY(hek), HEK_LEN(hek));
+	    if (HEK_UTF8(hek))
+		SvUTF8_on (sv);
+	} else {
             sv = newSVpvn_share(HEK_KEY(hek),
                                 (HEK_UTF8(hek) ? -HEK_LEN(hek) : HEK_LEN(hek)),
                                 HEK_HASH(hek));

==== //depot/perl/hv.h#43 (text) ====

@@ -89,6 +89,24 @@
 	(hash) = (hash_PeRlHaSh + (hash_PeRlHaSh << 15)); \
     } STMT_END
 
+#ifdef PERL_IN_HV_C
+#define PERL_HASH_INTERNAL(hash,str,len) \
+     STMT_START	{ \
+	register const char *s_PeRlHaSh_tmp = str; \
+	register const unsigned char *s_PeRlHaSh = (const unsigned char *)s_PeRlHaSh_tmp; \
+	register I32 i_PeRlHaSh = len; \
+	register U32 hash_PeRlHaSh = PL_new_hash_seed; \
+	while (i_PeRlHaSh--) { \
+	    hash_PeRlHaSh += *s_PeRlHaSh++; \
+	    hash_PeRlHaSh += (hash_PeRlHaSh << 10); \
+	    hash_PeRlHaSh ^= (hash_PeRlHaSh >> 6); \
+	} \
+	hash_PeRlHaSh += (hash_PeRlHaSh << 3); \
+	hash_PeRlHaSh ^= (hash_PeRlHaSh >> 11); \
+	(hash) = (hash_PeRlHaSh + (hash_PeRlHaSh << 15)); \
+    } STMT_END
+#endif
+
 /*
 =head1 Hash Manipulation Functions
 
@@ -203,6 +221,10 @@
 #define HvLAZYDEL_on(hv)	(SvFLAGS(hv) |= SVphv_LAZYDEL)
 #define HvLAZYDEL_off(hv)	(SvFLAGS(hv) &= ~SVphv_LAZYDEL)
 
+#define HvREHASH(hv)		(SvFLAGS(hv) & SVphv_REHASH)
+#define HvREHASH_on(hv)		(SvFLAGS(hv) |= SVphv_REHASH)
+#define HvREHASH_off(hv)	(SvFLAGS(hv) &= ~SVphv_REHASH)
+
 /* Maybe amagical: */
 /* #define HV_AMAGICmb(hv)      (SvFLAGS(hv) & (SVpgv_badAM | SVpgv_AM)) */
 
@@ -224,6 +246,7 @@
 #define HeKLEN(he)		HEK_LEN(HeKEY_hek(he))
 #define HeKUTF8(he)  HEK_UTF8(HeKEY_hek(he))
 #define HeKWASUTF8(he)  HEK_WASUTF8(HeKEY_hek(he))
+#define HeKREHASH(he)  HEK_REHASH(HeKEY_hek(he))
 #define HeKLEN_UTF8(he)  (HeKUTF8(he) ? -HeKLEN(he) : HeKLEN(he))
 #define HeKFLAGS(he)  HEK_FLAGS(HeKEY_hek(he))
 #define HeVAL(he)		(he)->hent_val
@@ -254,6 +277,7 @@
 
 #define HVhek_UTF8	0x01 /* Key is utf8 encoded. */
 #define HVhek_WASUTF8	0x02 /* Key is bytes here, but was supplied as utf8. */
+#define HVhek_REHASH	0x04 /* This key is in an hv using a custom HASH . */
 #define HVhek_FREEKEY	0x100 /* Internal flag to say key is malloc()ed.  */
 #define HVhek_PLACEHOLD	0x200 /* Internal flag to create placeholder.
                                * (may change, but Storable is a core module) */
@@ -265,6 +289,8 @@
 #define HEK_WASUTF8(hek)	(HEK_FLAGS(hek) & HVhek_WASUTF8)
 #define HEK_WASUTF8_on(hek)	(HEK_FLAGS(hek) |= HVhek_WASUTF8)
 #define HEK_WASUTF8_off(hek)	(HEK_FLAGS(hek) &= ~HVhek_WASUTF8)
+#define HEK_REHASH(hek)		(HEK_FLAGS(hek) & HVhek_REHASH)
+#define HEK_REHASH_on(hek)	(HEK_FLAGS(hek) |= HVhek_REHASH)
 
 /* calculate HV array allocation */
 #if defined(STRANGE_MALLOC) || defined(MYMALLOC)

==== //depot/perl/intrpvar.h#134 (text) ====

@@ -531,6 +531,10 @@
 
 PERLVARI(Icv_has_eval, I32, 0) /* PL_compcv includes an entereval or similar */
 
+PERLVARI(Inew_hash_seed, UV, 0)		/* 582 hash initializer */
+
+PERLVARI(Inew_hash_seed_set, bool, FALSE)	/* 582 hash initialized? */
+
 /* New variables must be added to the very end, before this comment,
  * for binary compatibility (the offsets of the old members must not change).
  * (Don't forget to add your variable also to perl_clone()!)

==== //depot/perl/perl.c#527 (text) ====

@@ -918,7 +918,7 @@
      * it is your responsibility to provide a good random seed!
      * You can also define PERL_HASH_SEED in compile time, see hv.h. */
     if (!PL_hash_seed_set)
-	 PL_hash_seed = get_hash_seed();
+	 PL_new_hash_seed = get_hash_seed();
     {
 	 char *s = PerlEnv_getenv("PERL_HASH_SEED_DEBUG");
 
@@ -927,7 +927,7 @@
 
 	      if (i == 1)
 		   PerlIO_printf(Perl_debug_log, "HASH_SEED = %"UVuf"\n",
-				 PL_hash_seed);
+				 PL_new_hash_seed);
 	 }
     }
 #endif /* #if defined(USE_HASH_SEED) || defined(USE_HASH_SEED_EXPLICIT) */

==== //depot/perl/perlapi.h#103 (text+w) ====

@@ -398,6 +398,10 @@
 #define PL_multi_start		(*Perl_Imulti_start_ptr(aTHX))
 #undef  PL_multiline
 #define PL_multiline		(*Perl_Imultiline_ptr(aTHX))
+#undef  PL_new_hash_seed
+#define PL_new_hash_seed	(*Perl_Inew_hash_seed_ptr(aTHX))
+#undef  PL_new_hash_seed_set
+#define PL_new_hash_seed_set	(*Perl_Inew_hash_seed_set_ptr(aTHX))
 #undef  PL_nexttoke
 #define PL_nexttoke		(*Perl_Inexttoke_ptr(aTHX))
 #undef  PL_nexttype

==== //depot/perl/sv.c#691 (text) ====

@@ -11349,6 +11349,7 @@
     PL_glob_index	= proto_perl->Iglob_index;
     PL_srand_called	= proto_perl->Isrand_called;
     PL_hash_seed	= proto_perl->Ihash_seed;
+    PL_new_hash_seed	= proto_perl->Inew_hash_seed;
     PL_uudmap['M']	= 0;		/* reinits on demand */
     PL_bitcount		= Nullch;	/* reinits on demand */
 

==== //depot/perl/sv.h#149 (text) ====

@@ -213,6 +213,7 @@
 
 #define SVrepl_EVAL	0x40000000	/* Replacement part of s///e */
 
+#define SVphv_REHASH	0x10000000	/* HV is recalculating hash values */
 #define SVphv_SHAREKEYS 0x20000000	/* keys live on shared string table */
 #define SVphv_LAZYDEL	0x40000000	/* entry in xhv_eiter must be deleted */
 #define SVphv_HASKFLAGS	0x80000000	/* keys have flag byte after hash */

==== //depot/perl/util.c#405 (text) ====

@@ -4427,7 +4427,7 @@
 		  Perl_croak(aTHX_ "Your random numbers are not that random");
 	  }
      }
-     PL_hash_seed_set = TRUE;
+     PL_new_hash_seed_set = TRUE;
 
      return myseed;
 }



Reply to: