% ---------------------------------------------------------------------------
% Table: Pass@k of the two Claude models on the NOIP dataset, one row per task.
% Placement is [htbp] so LaTeX may fall back to top/bottom/float page instead
% of silently rewriting a bare [h] (and warning about it).
% ---------------------------------------------------------------------------
\begin{table}[htbp]
  \centering
  \caption{NOIP Dataset Results}
  \label{tab:noip-results}
  \begin{tabular}{l l c c c c}
    \hline
    \textbf{Model} & \textbf{Task} & \textbf{Pass@1} & \textbf{Pass@3} & \textbf{Pass@5} & \textbf{Pass@10} \\
    \hline
    Claude-3.7-sonnet & valid test case generation & 0.8693 & 0.9219 & 0.9302 & 0.9406 \\
                      & targeted test case generation (without target instruction) & 0.3992 & 0.5675 & 0.6373 & 0.7201 \\
                      & targeted test case generation (with target instruction) & 0.6848 & 0.79 & 0.8211 & 0.8485 \\
    Claude-4-sonnet   & valid test case generation & 0.8804 & 0.9121 & 0.9206 & 0.9314 \\
                      & targeted test case generation (without target instruction) & 0.3777 & 0.5321 & 0.5937 & 0.6543 \\
                      & targeted test case generation (with target instruction) & 0.6652 & 0.7648 & 0.7993 & 0.8333 \\
    \hline
  \end{tabular}
\end{table}

% ---------------------------------------------------------------------------
% Table: Pass@1 broken down by error type, as two sub-tables
% (with / without target instruction), one column per model.
% \subtable[caption]{body} is the command form; it presumably comes from the
% subfigure package -- the preamble is not visible here, confirm it is loaded.
% The trailing % signs keep the line ends inside the \subtable argument from
% turning into stray spaces around the tabular.
% ---------------------------------------------------------------------------
\begin{table}[htbp]
  \centering
  \caption{Pass@1 Performance by Error Type}
  \label{tab:pass1-by-error-type}
  \subtable[With target instruction]{%
    \begin{tabular}{l c c c c}
      \hline
      \textbf{Error Type} & \textbf{GPT-4o} & \textbf{o1-mini} & \textbf{Qwen-Max} & \textbf{Claude-3.7-sonnet} \\
      \hline
      Logic            & 0.8064 & 0.686  & 0.76  & 0.6252 \\
      Array Size       & 0.7083 & 0.7263 & 0.8   & 0.7533 \\
      Time Complexity  & 0.736  & 0.778  & 0.864 & 0.7 \\
      Corner Case      & 0.74   & 0.7297 & 0.725 & 0.7593 \\
      Implementation   & 0.755  & 0.757  & 0.51  & 0.84 \\
      Integer Overflow & 0.7625 & 0.7125 & 1     & 0.75 \\
      Type Cast        & 0.7    & 0.96   & 0.72  & 0.9 \\
      Others           & 0.7443 & 0.8647 & 0.75  & 0.7167 \\
      \hline
    \end{tabular}%
  }
  \quad
  \subtable[Without target instruction]{%
    \begin{tabular}{l c c c c}
      \hline
      \textbf{Error Type} & \textbf{GPT-4o} & \textbf{o1-mini} & \textbf{Qwen-Max} & \textbf{Claude-3.7-sonnet} \\
      \hline
      Logic            & 0.4044 & 0.62   & 0.3667 & 0.5 \\
      Array Size       & 0.38   & 0.7175 & 0.32   & 0.4644 \\
      Time Complexity  & 0.436  & 0.672  & 0.36   & 0.566 \\
      Corner Case      & 0.364  & 0.724  & 0.276  & 0.266 \\
      Implementation   & 0.5233 & 0.5133 & 0.4167 & 0.3352 \\
      Integer Overflow & 0.66   & 0.528  & 0.396  & 0.084 \\
      Type Cast        & 0.22   & 0.42   & 0.4    & 0.12 \\
      Others           & 0.3733 & 0.6867 & 0.2667 & 0.5467 \\
      \hline
    \end{tabular}%
  }
\end{table}

% ---------------------------------------------------------------------------
% Table: Pass@k for targeted test case generation, four models.
% Within each model group the "without target instruction" row comes first,
% followed by the "with target instruction" row.
% ---------------------------------------------------------------------------
\begin{table}[htbp]
  \centering
  \caption{Targeted Test Case Generation Results}
  \label{tab:targeted-generation-results}
  \begin{tabular}{l l c c c c}
    \hline
    \textbf{Model} & \textbf{Task} & \textbf{Pass@1} & \textbf{Pass@3} & \textbf{Pass@5} & \textbf{Pass@10} \\
    \hline
    DeepSeek-V3       & targeted test case generation (without target instruction) & 0.4453 & 0.6846 & 0.7703 & 0.8302 \\
                      & targeted test case generation (with target instruction) & 0.7941 & 0.8549 & 0.8693 & 0.8824 \\
    % NOTE(review): "o1-Mini-Intpm" differs from the "o1-mini" spelling used in
    % the error-type table -- confirm the intended model name.
    o1-Mini-Intpm     & targeted test case generation (without target instruction) & 0.6945 & 0.8795 & 0.9119 & 0.9273 \\
                      & targeted test case generation (with target instruction) & 0.9284 & 0.9338 & 0.9782 & 1.0 \\
    Claude-3.7-sonnet & targeted test case generation (without target instruction) & 0.3957 & 0.5746 & 0.6383 & 0.6957 \\
                      & targeted test case generation (with target instruction) & 0.7882 & 0.9054 & 0.9346 & 0.9412 \\
    Qwen-Max          & targeted test case generation (without target instruction) & 0.3218 & 0.5817 & 0.6967 & 0.8364 \\
    % NOTE(review): in the row below Pass@5 (1.0) is higher than Pass@10 (0.9412).
    % Pass@k is not expected to decrease as k grows, so one of the two values
    % is likely a transcription error -- verify against the raw results.
                      & targeted test case generation (with target instruction) & 0.8176 & 0.8588 & 1.0 & 0.9412 \\
    \hline
  \end{tabular}
\end{table}

% ---------------------------------------------------------------------------
% Table: Pass@k of the finetuned Qwen-2.5-14B model on the three tasks.
% ---------------------------------------------------------------------------
\begin{table}[htbp]
  \centering
  \caption{Qwen-2.5-14B Finetuned Results}
  \label{tab:qwen25-14b-finetuned-results}
  \begin{tabular}{l l c c c c}
    \hline
    \textbf{Model} & \textbf{Task} & \textbf{Pass@1} & \textbf{Pass@3} & \textbf{Pass@5} & \textbf{Pass@10} \\
    \hline
    Qwen-2.5-14B Finetuned & basic & 0.744 & 0.9549 & 0.9797 & 1.0 \\
                           & target (without target instruction) & 0.3198 & 0.6469 & 0.6749 & 0.7929 \\
                           & target (with target instruction) & 0.6136 & 0.7778 & 0.8072 & 0.8333 \\
    \hline
  \end{tabular}
\end{table}