% Pass@k results on the NOIP dataset for Claude-3.7-sonnet and Claude-4-sonnet.
% Rows are grouped by model; the model name is printed only on the first row
% of each group, and the following rows leave the Model cell empty.
%
% Placement is [htbp] rather than a lone [h]: LaTeX relaxes a bare [h] to
% [ht] with a warning and may push the float far from its call-out.
\begin{table}[htbp]
  \centering
  \caption{NOIP Dataset Results}
  \label{tab:noip-results}
  % NOTE(review): the Task column holds long text in an `l' column, which never
  % wraps. If this table produces an overfull hbox in the target layout,
  % switch that column to a p{<width>} column -- confirm against the real
  % document class, which is not visible from here.
  \begin{tabular}{l l c c c c}
    \hline
    \textbf{Model} & \textbf{Task} & \textbf{Pass@1} & \textbf{Pass@3} & \textbf{Pass@5} & \textbf{Pass@10} \\
    \hline
    % All scores are printed with four decimals so the columns read uniformly.
    Claude-3.7-sonnet & valid test case generation & 0.8693 & 0.9219 & 0.9302 & 0.9406 \\
                      & targeted test case generation (without target instruction) & 0.3992 & 0.5675 & 0.6373 & 0.7201 \\
                      & targeted test case generation (with target instruction) & 0.6848 & 0.7900 & 0.8211 & 0.8485 \\
    Claude-4-sonnet   & valid test case generation & 0.8804 & 0.9121 & 0.9206 & 0.9314 \\
                      & targeted test case generation (without target instruction) & 0.3777 & 0.5321 & 0.5937 & 0.6543 \\
                      & targeted test case generation (with target instruction) & 0.6652 & 0.7648 & 0.7993 & 0.8333 \\
    \hline
  \end{tabular}
\end{table}

% Pass@1 broken down by error type for four models, shown as two sub-tables:
% (a) with the target instruction, (b) without it.
%
% Placement is [htbp] rather than a lone [h]: LaTeX relaxes a bare [h] to
% [ht] with a warning and may push the float far from its call-out.
%
% NOTE(review): the \subtable[caption]{body} form is presumably provided by the
% obsolete `subfigure' package; migrating to `subcaption' needs a preamble
% change that is outside this block.
\begin{table}[htbp]
  \centering
  \caption{Pass@1 Performance by Error Type}
  \label{tab:pass1-by-error-type}
  % The trailing `%' after `{' and after \end{tabular} keeps the line ends from
  % turning into spaces inside the sub-table box, which would otherwise pad
  % the box and skew the centering.
  \subtable[With target instruction]{%
    \begin{tabular}{l c c c c}
      \hline
      \textbf{Error Type} & \textbf{GPT-4o} & \textbf{o1-mini} & \textbf{Qwen-Max} & \textbf{Claude-3.7-sonnet} \\
      \hline
      % All scores are printed with four decimals so the columns read uniformly.
      Logic            & 0.8064 & 0.6860 & 0.7600 & 0.6252 \\
      Array Size       & 0.7083 & 0.7263 & 0.8000 & 0.7533 \\
      Time Complexity  & 0.7360 & 0.7780 & 0.8640 & 0.7000 \\
      Corner Case      & 0.7400 & 0.7297 & 0.7250 & 0.7593 \\
      Implementation   & 0.7550 & 0.7570 & 0.5100 & 0.8400 \\
      Integer Overflow & 0.7625 & 0.7125 & 1.0000 & 0.7500 \\
      Type Cast        & 0.7000 & 0.9600 & 0.7200 & 0.9000 \\
      Others           & 0.7443 & 0.8647 & 0.7500 & 0.7167 \\
      \hline
    \end{tabular}%
  }
  \quad
  \subtable[Without target instruction]{%
    \begin{tabular}{l c c c c}
      \hline
      \textbf{Error Type} & \textbf{GPT-4o} & \textbf{o1-mini} & \textbf{Qwen-Max} & \textbf{Claude-3.7-sonnet} \\
      \hline
      Logic            & 0.4044 & 0.6200 & 0.3667 & 0.5000 \\
      Array Size       & 0.3800 & 0.7175 & 0.3200 & 0.4644 \\
      Time Complexity  & 0.4360 & 0.6720 & 0.3600 & 0.5660 \\
      Corner Case      & 0.3640 & 0.7240 & 0.2760 & 0.2660 \\
      Implementation   & 0.5233 & 0.5133 & 0.4167 & 0.3352 \\
      Integer Overflow & 0.6600 & 0.5280 & 0.3960 & 0.0840 \\
      Type Cast        & 0.2200 & 0.4200 & 0.4000 & 0.1200 \\
      Others           & 0.3733 & 0.6867 & 0.2667 & 0.5467 \\
      \hline
    \end{tabular}%
  }
\end{table}

% Pass@k for targeted test case generation, per model, with and without the
% target instruction. Each model contributes two rows, always in the order
% "without target instruction" then "with target instruction"; the model name
% is printed only on the first row of the pair.
%
% Placement is [htbp] rather than a lone [h]: LaTeX relaxes a bare [h] to
% [ht] with a warning and may push the float far from its call-out.
\begin{table}[htbp]
  \centering
  \caption{Targeted Test Case Generation Results}
  \label{tab:targeted-results}
  % NOTE(review): the Task column holds long text in an `l' column, which never
  % wraps. If this table produces an overfull hbox in the target layout,
  % switch that column to a p{<width>} column -- confirm against the real
  % document class, which is not visible from here.
  \begin{tabular}{l l c c c c}
    \hline
    \textbf{Model} & \textbf{Task} & \textbf{Pass@1} & \textbf{Pass@3} & \textbf{Pass@5} & \textbf{Pass@10} \\
    \hline
    % All scores are printed with four decimals so the columns read uniformly.
    DeepSeek-V3       & targeted test case generation (without target instruction) & 0.4453 & 0.6846 & 0.7703 & 0.8302 \\
                      & targeted test case generation (with target instruction) & 0.7941 & 0.8549 & 0.8693 & 0.8824 \\
    % NOTE(review): "o1-Mini-Intpm" differs from the "o1-mini" spelling used in
    % the per-error-type table -- confirm the intended model name.
    o1-Mini-Intpm     & targeted test case generation (without target instruction) & 0.6945 & 0.8795 & 0.9119 & 0.9273 \\
                      & targeted test case generation (with target instruction) & 0.9284 & 0.9338 & 0.9782 & 1.0000 \\
    Claude-3.7-sonnet & targeted test case generation (without target instruction) & 0.3957 & 0.5746 & 0.6383 & 0.6957 \\
                      & targeted test case generation (with target instruction) & 0.7882 & 0.9054 & 0.9346 & 0.9412 \\
    % Qwen-Max rows are listed without/with, matching every other model above.
    Qwen-Max          & targeted test case generation (without target instruction) & 0.3218 & 0.5817 & 0.6967 & 0.8364 \\
    % TODO(review): Pass@5 (1.0000) exceeds Pass@10 (0.9412) in the next row.
    % Pass@k cannot decrease as k grows, so one of these two cells is wrong;
    % values are left as found -- re-check against the raw evaluation output.
                      & targeted test case generation (with target instruction) & 0.8176 & 0.8588 & 1.0000 & 0.9412 \\
    \hline
  \end{tabular}
\end{table}

% Pass@k results for the fine-tuned Qwen-2.5-14B model on three tasks.
% The model name is printed only on the first row; the following rows leave
% the Model cell empty.
%
% Placement is [htbp] rather than a lone [h]: LaTeX relaxes a bare [h] to
% [ht] with a warning and may push the float far from its call-out.
\begin{table}[htbp]
  \centering
  \caption{Qwen-2.5-14B Finetuned Results}
  \label{tab:qwen25-14b-finetuned}
  \begin{tabular}{l l c c c c}
    \hline
    \textbf{Model} & \textbf{Task} & \textbf{Pass@1} & \textbf{Pass@3} & \textbf{Pass@5} & \textbf{Pass@10} \\
    \hline
    % NOTE(review): the task names here ("basic", "target ...") differ from the
    % "valid test case generation" / "targeted test case generation ..." wording
    % in the other result tables. They presumably denote the same tasks --
    % confirm and unify the wording if so.
    % All scores are printed with four decimals so the columns read uniformly.
    Qwen-2.5-14B Finetuned & basic & 0.7440 & 0.9549 & 0.9797 & 1.0000 \\
                           & target (without target instruction) & 0.3198 & 0.6469 & 0.6749 & 0.7929 \\
                           & target (with target instruction) & 0.6136 & 0.7778 & 0.8072 & 0.8333 \\
    \hline
  \end{tabular}
\end{table}