diff --git a/evoprompt/evolution/evolution.py b/evoprompt/evolution/evolution.py
index e6b7d365237ad9221fd85c3890ceb4a192640e1c..033f359d5374fdd89dce6ed72b51e010dc8c9928 100644
--- a/evoprompt/evolution/evolution.py
+++ b/evoprompt/evolution/evolution.py
@@ -249,6 +249,7 @@ class GeneticAlgorithm(EvolutionAlgorithm):
             self.evolution_model.create_completion(
                 system_message=SYSTEM_MESSAGE,
                 prompt=filled_prompt,
+                enforce_randomness=True,
             )
         )
 
@@ -351,6 +352,7 @@ class DifferentialEvolution(EvolutionAlgorithm):
             self.evolution_model.create_completion(
                 system_message=SYSTEM_MESSAGE,
                 prompt=filled_prompt,
+                enforce_randomness=True,
             )
         )
 
@@ -468,6 +470,7 @@ class DifferentialEvolutionWithCot(DifferentialEvolution):
                     prompt=filled_prompt,
                     history=history,
                     stop="</prompt>" if idx == len(DE_COT_PROMPTS) - 1 else None,
+                    enforce_randomness=True,
                 )
             )
             logger.debug(
diff --git a/evoprompt/models.py b/evoprompt/models.py
index 34eb729fee3fa07464e007a872320c4718157520..9380a073b0164ad5dc0b5b983253ea7c6d2145b3 100644
--- a/evoprompt/models.py
+++ b/evoprompt/models.py
@@ -275,7 +275,8 @@ class Llama(LLMModel):
             "max_tokens": max_tokens,
         }
         if enforce_randomness:
-            model_call_kwargs["temperature"] = 2.0
+            # same temperature as in the EvoPrompt paper's reference implementation
+            model_call_kwargs["temperature"] = 0.5
             model_call_kwargs["seed"] = random.randint(0, 2**32 - 1)
         else:
             model_call_kwargs["temperature"] = 0.0
@@ -398,7 +399,8 @@ class LlamaChat(ChatModel, Llama):
             "max_tokens": max_tokens,
         }
         if enforce_randomness:
-            model_call_kwargs["temperature"] = 2.0
+            # same temperature as in the EvoPrompt paper's reference implementation
+            model_call_kwargs["temperature"] = 0.5
             model_call_kwargs["seed"] = random.randint(0, 2**32 - 1)
         else:
             model_call_kwargs["temperature"] = 0.0
@@ -478,7 +480,8 @@ class HfChat(ChatModel, LLMModel):
             "max_length": max_tokens if max_tokens is not None else 2048,
         }
         if enforce_randomness:
-            model_call_kwargs["temperature"] = 2.0
+            # same temperature as in the EvoPrompt paper's reference implementation
+            model_call_kwargs["temperature"] = 0.5
             model_call_kwargs["do_sample"] = True
         else:
             model_call_kwargs["do_sample"] = False
@@ -575,7 +578,8 @@ class OpenAIChat(ChatModel, LLMModel):
             "max_completion_tokens": max_tokens if max_tokens is not None else 1024,
         }
         if enforce_randomness:
-            model_call_kwargs["temperature"] = 2.0
+            # same temperature as in the EvoPrompt paper's reference implementation
+            model_call_kwargs["temperature"] = 0.5
         else:
             model_call_kwargs["temperature"] = 0.0
 
diff --git a/evoprompt/task/task.py b/evoprompt/task/task.py
index cfac945fbfacc200741eb704272825d16c1c0f59..b073a875984ed8cfa5f6942aa13064ed1f27e2c0 100644
--- a/evoprompt/task/task.py
+++ b/evoprompt/task/task.py
@@ -415,7 +415,7 @@ class Task(metaclass=ABCMeta):
             # we use cached completions to speed up the process; although we lose the non-deterministic behavior of LMs, we're ok with a single result
             use_cache=True,
             # use less randomness, i.e., more certain outputs
-            temperature=0.0,
+            enforce_randomness=False,
         )
 
         if not self.use_grammar:
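
Taken together, these changes route randomness through a single enforce_randomness flag: the evolution operators request sampled completions, each model backend maps the flag to temperature 0.5 (plus a fresh seed or do_sample=True where the backend supports it), and task evaluation passes enforce_randomness=False so deterministic, cacheable completions remain valid. Below is a minimal sketch of that mapping, assuming a standalone helper for illustration only (the name sampling_kwargs is hypothetical; in the diff the equivalent logic lives inline in each model's completion path):

    import random

    def sampling_kwargs(enforce_randomness: bool, max_tokens: int | None = None) -> dict:
        # Illustrative helper, not part of the codebase: builds the decoding
        # kwargs the per-model branches in models.py assemble inline.
        kwargs = {"max_tokens": max_tokens if max_tokens is not None else 1024}
        if enforce_randomness:
            # 0.5 follows the EvoPrompt paper's reference implementation; a fresh
            # seed keeps repeated evolution calls from returning identical prompts
            kwargs["temperature"] = 0.5
            kwargs["seed"] = random.randint(0, 2**32 - 1)
        else:
            # deterministic decoding for evaluation, so cached completions stay valid
            kwargs["temperature"] = 0.0
        return kwargs

Keeping the flag at the call sites (rather than a raw temperature) lets each backend pick the keys it actually understands, e.g. seed for llama.cpp-style models versus do_sample for Hugging Face pipelines, without the evolution code knowing about those details.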